6.4 Regression Visualization
"Visualization is the language of discovery."— John W. Tukey, Statistician
Diagnosing models, presenting results
Section Objectives
After completing this section, you will be able to:
- Create regression fit plots
- Draw the four-in-one residual diagnostic plots
- Visualize regression coefficients and confidence intervals
- Display prediction results
Four-in-One Regression Diagnostic Plots
python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
# Simulate a wage data set, fit an OLS regression on it, and draw the
# four-in-one residual diagnostic figure (2x2 grid of panels).
from statsmodels.nonparametric.smoothers_lowess import lowess

# --- Simulated data -------------------------------------------------------
# Fixed seed so the simulated sample is the same on every run.
np.random.seed(42)
n = 200
education = np.random.normal(13, 3, n)
experience = np.random.uniform(0, 30, n)
noise = np.random.normal(0, 0.3, n)
# log wage is linear in education and quadratic in experience, plus noise.
log_wage = 1.5 + 0.08*education + 0.03*experience - 0.0005*experience**2 + noise

df = pd.DataFrame(dict(
    log_wage=log_wage,
    education=education,
    experience=experience,
    experience_sq=experience**2,
))

# --- Model fit ------------------------------------------------------------
# add_constant prepends the intercept column that OLS does not add itself.
regressors = ['education', 'experience', 'experience_sq']
X = sm.add_constant(df[regressors])
model = sm.OLS(df['log_wage'], X).fit()

# Per-observation diagnostics used by the two bottom panels.
influence = model.get_influence()
standardized_resid = influence.resid_studentized_internal
leverage = influence.hat_matrix_diag

# --- Figure ---------------------------------------------------------------
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
(ax_resid, ax_qq), (ax_scale, ax_leverage) = axes
fitted = model.fittedvalues

# Panel 1 (top-left): residuals against fitted values, with a zero reference
# line and a LOWESS smooth of the residuals drawn on top.
ax_resid.scatter(fitted, model.resid, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax_resid.set(
    xlabel='Fitted Values',
    ylabel='Residuals',
    title='Residuals vs Fitted Values',
)
# lowess returns a two-column array: column 0 is x, column 1 the smoothed y.
lowess_result = lowess(model.resid, fitted, frac=0.3)
smooth_x = lowess_result[:, 0]
smooth_y = lowess_result[:, 1]
ax_resid.plot(smooth_x, smooth_y, 'b-', linewidth=2)

# Panel 2 (top-right): normal Q-Q plot of the raw residuals.
# probplot sets its own title on the axes, so the title is overridden after.
stats.probplot(model.resid, dist="norm", plot=ax_qq)
ax_qq.set_title('Normal Q-Q Plot')

# Panel 3 (bottom-left): square root of the absolute standardized residuals
# against fitted values.
root_abs_resid = np.sqrt(np.abs(standardized_resid))
ax_scale.scatter(fitted, root_abs_resid, alpha=0.5)
ax_scale.set(
    xlabel='Fitted Values',
    ylabel='√|Standardized Residuals|',
    title='Scale-Location Plot',
)

# Panel 4 (bottom-right): standardized residuals against leverage.
ax_leverage.scatter(leverage, standardized_resid, alpha=0.5)
ax_leverage.set(
    xlabel='Leverage',
    ylabel='Standardized Residuals',
    title='Residuals vs Leverage',
)

# Same light grid on every panel.
for panel in axes.flat:
    panel.grid(True, alpha=0.3)

fig.suptitle('Regression Diagnostic Plots', fontsize=16, fontweight='bold', y=0.995)
fig.tight_layout()
plt.show()
Coefficient Plot
python
# Coefficient plot: one point per slope estimate with a horizontal bar
# spanning its confidence interval.

# Drop the intercept so only the slope coefficients are drawn.
coefs = model.params.drop('const')
ci = model.conf_int().drop('const')
# conf_int() labels its bound columns 0 (lower) and 1 (upper).
ci_lower, ci_upper = ci[0], ci[1]

# errorbar wants distances from the point estimate, not interval endpoints.
err_below = coefs - ci_lower
err_above = ci_upper - coefs

fig, ax = plt.subplots(figsize=(10, 6))
y_pos = np.arange(len(coefs))

ax.errorbar(
    coefs,
    y_pos,
    xerr=[err_below, err_above],
    fmt='o',
    markersize=8,
    capsize=5,
    capthick=2,
    linewidth=2,
)

# Vertical reference line at zero: an interval that crosses it includes 0.
ax.axvline(x=0, color='red', linestyle='--', linewidth=1.5, alpha=0.7)

# Label each row with the name of its coefficient.
ax.set_yticks(y_pos)
ax.set_yticklabels(coefs.index)
ax.set_xlabel('Coefficient Estimate')
ax.set_title(
    'Regression Coefficients with 95% Confidence Intervals',
    fontsize=14,
    fontweight='bold',
)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()
Prediction Visualization
python
# Predicted log wage across education levels, holding experience fixed at
# 10 years, drawn over the observations whose experience is close to 10.

# Prediction grid: education varies, the other regressors stay constant.
# Columns follow the order of the design matrix used to fit the model.
fixed_experience = 10
edu_range = np.linspace(6, 20, 100)
pred_data = pd.DataFrame({
    'const': 1,
    'education': edu_range,
    'experience': fixed_experience,  # Fix experience=10 years
    'experience_sq': fixed_experience**2,
})

# Predicted mean and its interval bounds at alpha=0.05.
predictions = model.get_prediction(pred_data)
pred_summary = predictions.summary_frame(alpha=0.05)
mean_line = pred_summary['mean']
band_low = pred_summary['mean_ci_lower']
band_high = pred_summary['mean_ci_upper']

plt.figure(figsize=(10, 6))

# Observed points with experience between 8 and 12 years (both inclusive).
mask = df['experience'].between(8, 12)
nearby = df.loc[mask]
plt.scatter(
    nearby['education'],
    nearby['log_wage'],
    alpha=0.5,
    s=50,
    label='Actual Data (Experience≈10 years)',
)

# Fitted mean as a line, with the interval for the mean as a shaded band.
plt.plot(edu_range, mean_line, 'r-', linewidth=2, label='Predicted Mean')
plt.fill_between(
    edu_range,
    band_low,
    band_high,
    alpha=0.2,
    color='red',
    label='95% Confidence Interval',
)

plt.xlabel('Years of Education', fontsize=12)
plt.ylabel('log(Wage)', fontsize=12)
plt.title(
    'Predicted Impact of Education on Wage (Controlling for Experience=10 years)',
    fontsize=14,
    fontweight='bold',
)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Section Summary
Key Diagnostic Plots
- Residuals vs Fitted Values: Test linearity and homoscedasticity
- Q-Q Plot: Test normality
- Scale-Location: Test homoscedasticity
- Residuals vs Leverage: Identify influential points
Next Section Preview
In the next section, we will learn how to compare distributions across multiple groups.
Continue deepening visualization skills!