OOP in Data Science
Understanding the Object-Oriented Design of Common Libraries
Why Do Data Science Libraries Use OOP?
Pandas, Scikit-learn, and Statsmodels are all object-oriented because:
- Data and methods are naturally bound together (see the short sketch after this list)
- Method chaining is more fluent
- Code is easier to maintain
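A minimal sketch of the first point: a pandas Series carries its methods with it, while a plain list needs a separate function (the `list_mean` helper below is hypothetical, written only for contrast).
```python
import pandas as pd

# Without OOP: data and logic live apart; you pass the data to a free function
def list_mean(values):  # hypothetical helper, just for contrast
    return sum(values) / len(values)

ages_list = [25, 30, 35, 40]
print(list_mean(ages_list))  # 32.5

# With OOP: the Series object bundles the data together with its methods
ages = pd.Series([25, 30, 35, 40], name='age')
print(ages.mean())  # 32.5
```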
Pandas' OOP Design
DataFrame Objects
```python
import numpy as np
import pandas as pd

# DataFrame is a class
df = pd.DataFrame({
    'age': [25, 30, 35, 40],
    'income': [50000, 60000, 75000, 80000]
})

# Attributes
print(df.shape)    # (4, 2)
print(df.columns)  # Index(['age', 'income'], dtype='object')
print(df.dtypes)   # Data types

# Methods (method chaining)
result = (df
    .query('age > 30')                                  # Filter
    .assign(log_income=lambda x: np.log(x['income']))   # New column
    .sort_values('income')                              # Sort
    .reset_index(drop=True)                             # Reset index
)
```
Series Objects
```python
# Series is also an object
ages = pd.Series([25, 30, 35, 40], name='age')
# Methods
print(ages.mean()) # 32.5
print(ages.std()) # 6.45
print(ages.quantile(0.5)) # 32.5
# Method chaining
result = (ages
    .apply(lambda x: x ** 2)  # Square
    .sort_values(ascending=False)
    .head(3)
)
```
Scikit-learn's OOP Design
Model Objects
```python
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Create objects
model = LinearRegression()
scaler = StandardScaler()
# Train (fit method)
X = [[1], [2], [3], [4], [5]]
y = [2, 4, 6, 8, 10]
model.fit(X, y)
# Predict (predict method)
predictions = model.predict([[6], [7]])
print(predictions) # [12. 14.]
# Access attributes
print(model.coef_) # Coefficients
print(model.intercept_)  # Intercept
```
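Every scikit-learn estimator is an object with the same `fit`/`predict` interface, so models can be swapped with minimal code changes. A small sketch using a different estimator (the tree model is an illustrative alternative, not part of the original example):
```python
from sklearn.tree import DecisionTreeRegressor

X = [[1], [2], [3], [4], [5]]
y = [2, 4, 6, 8, 10]

# A different model class, but the same fit() / predict() methods
tree = DecisionTreeRegressor()
tree.fit(X, y)
print(tree.predict([[6], [7]]))  # predictions from the tree model
```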
Why Use OOP?
```python
# Without OOP (hypothetical)
X_scaled = standard_scale(X)
model_params = fit_linear_regression(X_scaled, y)
predictions = predict_linear_regression(model_params, X_test)
# Using OOP (actual)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
model = LinearRegression()
model.fit(X_scaled, y)
X_test_scaled = scaler.transform(X_test)  # Reuse the fitted scaler on new data
predictions = model.predict(X_test_scaled)
```
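Because the scaler and the model share a consistent object interface, scikit-learn can also compose them. A short sketch with `sklearn.pipeline.Pipeline` (not part of the original example, but a standard way to chain the two steps):
```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

X = [[1], [2], [3], [4], [5]]
y = [2, 4, 6, 8, 10]

# The pipeline is itself an object with fit() and predict()
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
pipe.fit(X, y)
print(pipe.predict([[6], [7]]))  # scaling and prediction in one call
```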
Statsmodels' OOP Design
```python
import statsmodels.formula.api as smf
import pandas as pd
df = pd.DataFrame({
    'income': [50000, 60000, 75000, 80000, 95000],
    'education': [12, 14, 16, 16, 18],
    'age': [25, 30, 35, 40, 45]
})
# Create model object
model = smf.ols('income ~ education + age', data=df)
# Fit
results = model.fit()
# Access result attributes and methods
print(results.summary())
print(results.rsquared)
print(results.params)
print(results.pvalues)
```
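The results object also has methods of its own, for example `predict()` for new observations. A brief continuation of the block above (the new observations are made up for illustration):
```python
# Predict income for new observations using the fitted results object
new_data = pd.DataFrame({'education': [14, 16], 'age': [28, 38]})
print(results.predict(new_data))
```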
Creating Your Own Data Science Classes
Example: Simple Linear Regression Class
```python
import numpy as np
class SimpleLinearRegression:
    """Simple linear regression (educational)"""

    def __init__(self):
        self.slope = None
        self.intercept = None

    def fit(self, X, y):
        """Fit the model"""
        X = np.array(X)
        y = np.array(y)
        # Calculate slope and intercept (least squares)
        x_mean = X.mean()
        y_mean = y.mean()
        numerator = ((X - x_mean) * (y - y_mean)).sum()
        denominator = ((X - x_mean) ** 2).sum()
        self.slope = numerator / denominator
        self.intercept = y_mean - self.slope * x_mean
        return self  # Return self (supports method chaining)

    def predict(self, X):
        """Predict"""
        if self.slope is None:
            raise ValueError("Model not trained, please call fit() first")
        X = np.array(X)
        return self.slope * X + self.intercept

    def score(self, X, y):
        """Calculate R²"""
        y = np.array(y)  # Accept plain lists as well as arrays
        y_pred = self.predict(X)
        ss_res = ((y - y_pred) ** 2).sum()
        ss_tot = ((y - y.mean()) ** 2).sum()
        return 1 - (ss_res / ss_tot)

    def __repr__(self):
        if self.slope is None:
            return "SimpleLinearRegression(unfitted)"
        return f"SimpleLinearRegression(slope={self.slope:.2f}, intercept={self.intercept:.2f})"
# Usage
X = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 5]
model = SimpleLinearRegression()
model.fit(X, y)
print(model) # SimpleLinearRegression(slope=0.60, intercept=2.20)
predictions = model.predict([6, 7, 8])
print(predictions) # [5.8 6.4 7. ]
r2 = model.score(X, y)
print(f"R² = {r2:.3f}")Hands-On: Data Analysis Pipeline Class
Hands-On: Data Analysis Pipeline Class
```python
class DataPipeline:
    """Data processing pipeline"""

    def __init__(self, df):
        self.df = df.copy()
        self.original_df = df.copy()
        self.steps = []

    def remove_missing(self, subset=None):
        """Remove missing values"""
        self.df = self.df.dropna(subset=subset)
        self.steps.append("remove_missing")
        return self  # Return self to support method chaining

    def filter_age(self, min_age, max_age):
        """Filter by age"""
        self.df = self.df[(self.df['age'] >= min_age) & (self.df['age'] <= max_age)]
        self.steps.append(f"filter_age({min_age}, {max_age})")
        return self

    def standardize(self, columns):
        """Standardize"""
        for col in columns:
            mean = self.df[col].mean()
            std = self.df[col].std()
            self.df[f'{col}_std'] = (self.df[col] - mean) / std
        self.steps.append(f"standardize({columns})")
        return self

    def get_result(self):
        """Get result"""
        return self.df

    def get_summary(self):
        """Processing summary"""
        print("=== Data Processing Pipeline ===")
        print(f"Original data: {len(self.original_df)} rows")
        print(f"After processing: {len(self.df)} rows")
        print("\nProcessing steps:")
        for i, step in enumerate(self.steps, 1):
            print(f"  {i}. {step}")
# Usage
import pandas as pd
df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'age': [25, 30, None, 40, 15, 50],
    'income': [50000, 60000, 75000, None, 30000, 90000]
})

# Method chaining
pipeline = DataPipeline(df)
result = (pipeline
    .remove_missing()
    .filter_age(18, 65)
    .standardize(['income'])
    .get_result()
)
pipeline.get_summary()
print("\nProcessed data:")
print(result)
```
OOP Best Practices (Data Science)
1. Design Chainable Methods
```python
class DataCleaner:
    def __init__(self, df):
        self.df = df

    def drop_na(self):
        self.df = self.df.dropna()
        return self  # Return self

    def remove_outliers(self, column):
        q1 = self.df[column].quantile(0.25)
        q3 = self.df[column].quantile(0.75)
        iqr = q3 - q1
        self.df = self.df[
            (self.df[column] >= q1 - 1.5 * iqr) &
            (self.df[column] <= q3 + 1.5 * iqr)
        ]
        return self

# Method chaining
cleaner = DataCleaner(df)
result = (cleaner
    .drop_na()
    .remove_outliers('income')
    .df
)
```
2. Use Attributes to Store Metadata
```python
class Model:
    def __init__(self):
        self.is_fitted = False
        self.n_features = None

    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.is_fitted = True
        # Training logic...
```
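A common use of such metadata is to guard later method calls. A minimal sketch under that assumption (the `GuardedModel` name and its `predict` check are illustrative, not part of the original snippet):
```python
import numpy as np

class GuardedModel:
    """Illustrative: uses stored metadata to validate later calls"""
    def __init__(self):
        self.is_fitted = False
        self.n_features = None

    def fit(self, X, y):
        X = np.asarray(X)
        self.n_features = X.shape[1]
        self.is_fitted = True
        return self

    def predict(self, X):
        X = np.asarray(X)
        # The stored metadata catches common mistakes early
        if not self.is_fitted:
            raise ValueError("Call fit() before predict()")
        if X.shape[1] != self.n_features:
            raise ValueError(f"Expected {self.n_features} features, got {X.shape[1]}")
        return np.zeros(X.shape[0])  # placeholder prediction logic
```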
3. Implement __repr__ for Debugging
```python
class Survey:
    def __init__(self, name, responses):
        self.name = name
        self.responses = responses
    def __repr__(self):
        return f"Survey(name='{self.name}', n={len(self.responses)})"
```
Summary
OOP essentials for social science students:
Pandas DataFrame is an object
```python
df.head()   # Method
df.shape    # Attribute
```
Scikit-learn models are objects
```python
model = LinearRegression()
model.fit(X, y)
model.predict(X_new)
```
You don't need to write complex classes, but you should know how to use them.
When to create your own classes?
- Building reusable data pipelines
- Encapsulating complex analysis logic
- Large projects need code organization
Next Steps
Module 6 complete! In the next module, we'll learn file operations, including reading and writing CSV, Excel, Stata, and other data files.
Keep moving forward!