Skip to content

OOP in Data Science

Understanding the Object-Oriented Design of Common Libraries


Why Do Data Science Libraries Use OOP?

Pandas, Scikit-learn, and Statsmodels are all object-oriented because:

  • Data and methods are naturally bound together
  • Method chaining is more fluent
  • Code is easier to maintain

Pandas' OOP Design

DataFrame Objects

python
import numpy as np  # required by np.log in the chained example below
import pandas as pd

# DataFrame is a class; calling it builds a DataFrame object
df = pd.DataFrame({
    'age': [25, 30, 35, 40],
    'income': [50000, 60000, 75000, 80000]
})

# Attributes (data stored on the object; accessed without parentheses)
print(df.shape)      # (4, 2)
print(df.columns)    # Index(['age', 'income'])
print(df.dtypes)     # Data types

# Methods (method chaining): the result of each call is used as the
# object on which the next method is called
result = (df
    .query('age > 30')           # Filter
    .assign(log_income=lambda x: np.log(x['income']))  # New column
    .sort_values('income')       # Sort
    .reset_index(drop=True)      # Reset index
)

Series Objects

python
# A Series is an object as well: a single named column of values
ages = pd.Series(data=[25, 30, 35, 40], name='age')

# Statistics are obtained by calling methods on the object
print(ages.mean())           # 32.5
print(ages.std())            # 6.45 (sample standard deviation)
print(ages.quantile(q=0.5))  # 32.5 (the median)

# Method chaining: square every value, then keep the three largest squares
result = (
    ages
    .apply(lambda age: age ** 2)
    .sort_values(ascending=False)
    .head(n=3)
)

Scikit-learn's OOP Design

Model Objects

python
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Create objects (each call instantiates a class)
model = LinearRegression()
scaler = StandardScaler()  # created for illustration; not used further in this snippet

# Train (fit method)
# X is a list of one-feature rows; y holds one target per row (here y = 2 * x)
X = [[1], [2], [3], [4], [5]]
y = [2, 4, 6, 8, 10]

model.fit(X, y)

# Predict (predict method)
# New inputs use the same one-feature-per-row layout as X
predictions = model.predict([[6], [7]])
print(predictions)  # [12. 14.]

# Access attributes (read from the model object after fit() has been called)
print(model.coef_)       # Coefficients
print(model.intercept_)  # Intercept

Why Use OOP?

python
# Without OOP (hypothetical)
# These free functions are invented for contrast; every piece of learned
# state has to be carried around by hand in separate variables
X_scaled = standard_scale(X)
model_params = fit_linear_regression(X_scaled, y)
predictions = predict_linear_regression(model_params, X_test)

# Using OOP (actual)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# The fitted scaler object keeps the training statistics, so the test data
# is scaled the same way as the training data before prediction
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_scaled, y)
predictions = model.predict(X_test_scaled)

Statsmodels' OOP Design

python
import statsmodels.formula.api as smf
import pandas as pd

# Toy dataset: five rows, three numeric columns
df = pd.DataFrame({
    'income': [50000, 60000, 75000, 80000, 95000],
    'education': [12, 14, 16, 16, 18],
    'age': [25, 30, 35, 40, 45]
})

# Create model object
# The formula string refers to columns of df by name:
# 'income' on the left of '~', 'education' and 'age' on the right
model = smf.ols('income ~ education + age', data=df)

# Fit
# fit() hands back a separate results object; `model` itself is kept as well
results = model.fit()

# Access result attributes and methods
print(results.summary())   # method call (parentheses)
print(results.rsquared)    # attribute
print(results.params)      # attribute
print(results.pvalues)     # attribute

Creating Your Own Data Science Classes

Example: Simple Linear Regression Class

python
import numpy as np

class SimpleLinearRegression:
    """Simple linear regression (educational).

    Fits the line ``y = slope * x + intercept`` to one-dimensional data
    by ordinary least squares.

    Attributes:
        slope: Fitted slope; ``None`` until ``fit()`` has been called.
        intercept: Fitted intercept; ``None`` until ``fit()`` has been called.
    """

    def __init__(self):
        self.slope = None
        self.intercept = None

    def fit(self, X, y):
        """Estimate slope and intercept from paired observations.

        Args:
            X: Sequence or array of predictor values.
            y: Sequence or array of response values, same length as ``X``.

        Returns:
            The model itself, so calls can be chained.

        Raises:
            ValueError: If ``X`` is empty or all of its values are equal,
                because the slope is undefined in both cases.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        if X.size == 0:
            raise ValueError("Cannot fit a model on empty data")

        # Deviations from the means are reused by numerator and denominator
        x_mean = X.mean()
        y_mean = y.mean()
        x_dev = X - x_mean

        numerator = (x_dev * (y - y_mean)).sum()
        denominator = (x_dev ** 2).sum()

        # Zero variance in X would otherwise produce a silent nan/inf slope
        if denominator == 0:
            raise ValueError("Cannot fit a model when all X values are identical")

        self.slope = numerator / denominator
        self.intercept = y_mean - self.slope * x_mean

        return self  # Return self (support method chaining)

    def predict(self, X):
        """Return predicted values for ``X``.

        Args:
            X: Sequence or array of predictor values.

        Returns:
            NumPy array of ``slope * X + intercept``.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.slope is None:
            raise ValueError("Model not trained, please call fit() first")

        X = np.asarray(X, dtype=float)
        return self.slope * X + self.intercept

    def score(self, X, y):
        """Calculate R² of the fitted line on ``X`` and ``y``.

        Args:
            X: Sequence or array of predictor values.
            y: Sequence or array of observed responses, same length as ``X``.

        Returns:
            ``1 - SS_res / SS_tot``.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        # Convert first: a plain list has no .mean() method
        y = np.asarray(y, dtype=float)
        y_pred = self.predict(X)
        ss_res = ((y - y_pred) ** 2).sum()
        ss_tot = ((y - y.mean()) ** 2).sum()
        return 1 - (ss_res / ss_tot)

    def __repr__(self):
        if self.slope is None:
            return "SimpleLinearRegression(unfitted)"
        return f"SimpleLinearRegression(slope={self.slope:.2f}, intercept={self.intercept:.2f})"

# Usage
# Plain Python lists are fine here: fit() and predict() convert their
# inputs with np.array before doing any arithmetic
X = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 5]

model = SimpleLinearRegression()
model.fit(X, y)
# print() shows the text built by the class's __repr__ method
print(model)  # SimpleLinearRegression(slope=0.60, intercept=2.20)

predictions = model.predict([6, 7, 8])
print(predictions)  # [5.8 6.4 7. ]

# score() is evaluated on the same data the model was fitted on
r2 = model.score(X, y)
print(f"R² = {r2:.3f}")

Hands-On: Data Analysis Pipeline Class

python
class DataPipeline:
    """Data processing pipeline.

    Holds its own copy of a DataFrame and applies cleaning steps to it.
    Every step method returns the pipeline itself so calls can be chained,
    and appends a description of itself to ``steps``.

    Attributes:
        df: Current (processed) DataFrame.
        original_df: Copy of the input frame, used by ``get_summary()``.
        steps: Descriptions of the steps applied so far, in order.
    """

    def __init__(self, df):
        # Two independent copies: one to process, one kept as the baseline
        self.df = df.copy()
        self.original_df = df.copy()
        self.steps = []

    def remove_missing(self, subset=None):
        """Remove rows with missing values.

        Args:
            subset: Passed to ``DataFrame.dropna``; ``None`` (default)
                checks every column.

        Returns:
            The pipeline itself.
        """
        self.df = self.df.dropna(subset=subset)
        self.steps.append("remove_missing")
        return self  # Return self to support method chaining

    def filter_age(self, min_age, max_age):
        """Keep rows whose 'age' is between min_age and max_age, inclusive.

        Returns:
            The pipeline itself.
        """
        age = self.df['age']
        self.df = self.df[(age >= min_age) & (age <= max_age)]
        self.steps.append(f"filter_age({min_age}, {max_age})")
        return self

    def standardize(self, columns):
        """Add a ``<col>_std`` column for each listed column.

        Each new column is ``(value - mean) / std`` of its source column,
        computed on the rows currently in the pipeline.

        Args:
            columns: Iterable of column names to standardize.

        Returns:
            The pipeline itself.
        """
        for col in columns:
            values = self.df[col]
            standardized = (values - values.mean()) / values.std()
            # assign() builds a new frame, like the other steps do, so a
            # frame previously returned by get_result() is never modified
            # behind the caller's back
            self.df = self.df.assign(**{f'{col}_std': standardized})
        self.steps.append(f"standardize({columns})")
        return self

    def get_result(self):
        """Return the current DataFrame (the pipeline's own object, not a copy)."""
        return self.df

    def get_summary(self):
        """Print row counts before/after processing and the numbered steps."""
        print("=== Data Processing Pipeline ===")
        print(f"Original data: {len(self.original_df)} rows")
        print(f"After processing: {len(self.df)} rows")
        print("\nProcessing steps:")
        for i, step in enumerate(self.steps, 1):
            print(f"  {i}. {step}")

# Usage
import pandas as pd

# Sample data with deliberate gaps: one missing age, one missing income,
# and one age (15) that falls outside the 18-65 range filtered below
df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'age': [25, 30, None, 40, 15, 50],
    'income': [50000, 60000, 75000, None, 30000, 90000]
})

# Method chaining
# Each step returns the pipeline itself, so the calls can be chained;
# get_result() ends the chain by returning the processed DataFrame
pipeline = DataPipeline(df)
result = (pipeline
    .remove_missing()
    .filter_age(18, 65)
    .standardize(['income'])
    .get_result()
)

# The pipeline object is still available after the chain and has
# recorded every step that was applied
pipeline.get_summary()
print("\nProcessed data:")
print(result)

OOP Best Practices (Data Science)

1. Design Chainable Methods

python
class DataCleaner:
    """Chainable cleaner around the DataFrame stored in ``df``."""

    def __init__(self, df):
        self.df = df

    def drop_na(self):
        """Drop every row containing a missing value; returns the cleaner."""
        without_missing = self.df.dropna()
        self.df = without_missing
        return self  # Return self

    def remove_outliers(self, column):
        """Keep rows whose ``column`` value lies within 1.5 * IQR of the quartiles.

        Returns the cleaner so further calls can be chained.
        """
        values = self.df[column]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        iqr = upper_quartile - lower_quartile
        # between() is inclusive on both ends by default
        within_fences = values.between(
            lower_quartile - 1.5 * iqr,
            upper_quartile + 1.5 * iqr,
        )
        self.df = self.df[within_fences]
        return self

# Method chaining
# drop_na() and remove_outliers() each return the cleaner itself, so the
# chain can finish by reading the cleaned DataFrame from its .df attribute
cleaner = DataCleaner(df)
result = (cleaner
    .drop_na()
    .remove_outliers('income')
    .df
)

2. Use Attributes to Store Metadata

python
class Model:
    """Skeleton model that records metadata about its training state.

    Attributes:
        is_fitted: ``True`` once ``fit()`` has completed.
        n_features: Second dimension of the ``X`` passed to ``fit()``;
            ``None`` before fitting.
    """

    def __init__(self):
        self.is_fitted = False
        self.n_features = None

    def fit(self, X, y):
        """Record metadata about the training data.

        Args:
            X: Two-dimensional training data exposing a ``.shape`` attribute.
            y: Target values (not used by this skeleton).

        Returns:
            The model itself, so calls can be chained.
        """
        self.n_features = X.shape[1]
        # Training logic...
        # Set the flag last so a failure above never leaves the model
        # marked as fitted
        self.is_fitted = True
        return self

3. Implement __repr__ for Debugging

python
class Survey:
    """A named survey together with its collected responses.

    Args:
        name: Display name of the survey.
        responses: Optional iterable of responses; copied into a new list.
    """

    def __init__(self, name="", responses=None):
        self.name = name
        # None sentinel avoids sharing one mutable default list between instances
        self.responses = [] if responses is None else list(responses)

    def __repr__(self):
        """Return a short description with the name and the response count."""
        return f"Survey(name='{self.name}', n={len(self.responses)})"

Summary

OOP essentials for social science students:

  1. Pandas DataFrame is an object

    python
    df.head()  # Method: called with parentheses
    df.shape   # Attribute: accessed without parentheses
  2. Scikit-learn models are objects

    python
    model = LinearRegression()  # Create the model object
    model.fit(X, y)             # Call its fit method on the training data
    model.predict(X_new)        # Call its predict method on new data
  3. You don't need to write complex classes yourself, but you should know how to use the classes that libraries provide

When to create your own classes?

  • Building reusable data pipelines
  • Encapsulating complex analysis logic
  • Organizing code in large projects

Next Steps

Module 6 complete! In the next module, we'll learn file operations, including reading and writing CSV, Excel, Stata, and other data files.

Keep moving forward!


Released under the MIT License. Content © Author.