6.4 Regression Visualization

"Visualization is the language of discovery." — John W. Tukey, Statistician

Diagnosing models, presenting results

Section Objectives

After completing this section, you will be able to:

Create regression fit plots
Build the four-in-one residual diagnostic plots
Visualize regression coefficients and confidence intervals
Display prediction results

Four-in-One Regression Diagnostic Plots

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats

# Simulate a wage data set, fit an OLS model, and draw the four standard
# residual diagnostic panels on a single 2x2 figure.
from statsmodels.nonparametric.smoothers_lowess import lowess

# --- Simulated data ---------------------------------------------------------
np.random.seed(42)  # fixed seed so every run draws the same sample
n = 200
# The draw order (education, experience, noise) determines the sample.
education = np.random.normal(13, 3, n)
experience = np.random.uniform(0, 30, n)
noise = np.random.normal(0, 0.3, n)
log_wage = 1.5 + 0.08*education + 0.03*experience - 0.0005*experience**2 + noise

df = pd.DataFrame({
    'log_wage': log_wage,
    'education': education,
    'experience': experience,
})
df['experience_sq'] = df['experience'] ** 2

# --- Model fit --------------------------------------------------------------
regressors = ['education', 'experience', 'experience_sq']
X = sm.add_constant(df[regressors])
model = sm.OLS(df['log_wage'], X).fit()

# Per-observation diagnostics used by the lower two panels.
influence = model.get_influence()
leverage = influence.hat_matrix_diag
standardized_resid = influence.resid_studentized_internal

fitted = model.fittedvalues
residuals = model.resid

# --- Four-in-one figure -----------------------------------------------------
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_resid, ax_qq), (ax_scale, ax_lev) = axes

# Panel 1: residuals against fitted values, with a zero reference line and
# a LOWESS smooth of the residuals drawn on top.
ax_resid.scatter(fitted, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', linewidth=2)
smooth = lowess(residuals, fitted, frac=0.3)
ax_resid.plot(smooth[:, 0], smooth[:, 1], 'b-', linewidth=2)
ax_resid.set_xlabel('Fitted Values')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals vs Fitted Values')

# Panel 2: normal Q-Q plot of the raw residuals. probplot draws onto the
# axes and sets its own title, which is then replaced.
stats.probplot(residuals, dist="norm", plot=ax_qq)
ax_qq.set_title('Normal Q-Q Plot')

# Panel 3: square root of the absolute standardized residuals against
# fitted values.
root_abs_resid = np.sqrt(np.abs(standardized_resid))
ax_scale.scatter(fitted, root_abs_resid, alpha=0.5)
ax_scale.set_xlabel('Fitted Values')
ax_scale.set_ylabel('√|Standardized Residuals|')
ax_scale.set_title('Scale-Location Plot')

# Panel 4: standardized residuals against leverage.
ax_lev.scatter(leverage, standardized_resid, alpha=0.5)
ax_lev.set_xlabel('Leverage')
ax_lev.set_ylabel('Standardized Residuals')
ax_lev.set_title('Residuals vs Leverage')

# The same light grid on every panel.
for panel in axes.flat:
    panel.grid(True, alpha=0.3)

fig.suptitle('Regression Diagnostic Plots', fontsize=16, fontweight='bold', y=0.995)
fig.tight_layout()
plt.show()

Coefficient Plot

python

# Coefficient plot: one point per slope estimate with a horizontal bar
# spanning its confidence interval. The intercept is dropped from the plot.
coefs = model.params.drop('const')
ci = model.conf_int().drop('const')
ci_lower, ci_upper = ci[0], ci[1]

# errorbar takes distances from the point, not the interval bounds.
dist_below = coefs - ci_lower
dist_above = ci_upper - coefs
y_pos = np.arange(len(coefs))

fig, ax = plt.subplots(figsize=(10, 6))

bar_style = dict(fmt='o', markersize=8, capsize=5, capthick=2, linewidth=2)
ax.errorbar(coefs, y_pos, xerr=[dist_below, dist_above], **bar_style)

# Vertical reference at zero: an interval crossing it includes "no effect".
ax.axvline(x=0, color='red', linestyle='--', linewidth=1.5, alpha=0.7)

# One labelled row per coefficient.
ax.set_yticks(y_pos)
ax.set_yticklabels(coefs.index)
ax.set_xlabel('Coefficient Estimate')
ax.set_title('Regression Coefficients with 95% Confidence Intervals', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

fig.tight_layout()
plt.show()

Prediction Visualization

python

# Predict log wages over a grid of education levels while holding experience
# fixed, then plot the predicted mean with its 95% confidence band on top of
# the observations whose experience is close to the fixed value.

# Single source of truth for the fixed experience level. The squared term,
# the data mask, and the plot labels are all derived from it, so changing it
# here cannot leave the prediction inputs inconsistent.
fixed_experience = 10   # years of experience held constant
experience_window = 2   # +/- years around fixed_experience shown as actual data

edu_range = np.linspace(6, 20, 100)
# Columns are listed in the same order as the regressors used to fit `model`
# (constant first).
pred_data = pd.DataFrame({
    'const': 1,
    'education': edu_range,
    'experience': fixed_experience,
    'experience_sq': fixed_experience ** 2,  # derived, not typed by hand
})

# Predictions
predictions = model.get_prediction(pred_data)
pred_summary = predictions.summary_frame(alpha=0.05)  # alpha=0.05 -> 95% bounds

# Plot
plt.figure(figsize=(10, 6))

# Actual data: observations within the window around the fixed experience
mask = (
    (df['experience'] >= fixed_experience - experience_window)
    & (df['experience'] <= fixed_experience + experience_window)
)
plt.scatter(df.loc[mask, 'education'], df.loc[mask, 'log_wage'],
           alpha=0.5, s=50,
           label=f'Actual Data (Experience≈{fixed_experience} years)')

# Prediction line
plt.plot(edu_range, pred_summary['mean'], 'r-', linewidth=2, label='Predicted Mean')

# Confidence interval for the predicted mean
plt.fill_between(edu_range, pred_summary['mean_ci_lower'], pred_summary['mean_ci_upper'],
                alpha=0.2, color='red', label='95% Confidence Interval')

plt.xlabel('Years of Education', fontsize=12)
plt.ylabel('log(Wage)', fontsize=12)
plt.title(
    f'Predicted Impact of Education on Wage (Controlling for Experience={fixed_experience} years)',
    fontsize=14, fontweight='bold',
)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Section Summary

Key Diagnostic Plots

Residuals vs Fitted Values: Check linearity and homoscedasticity
Q-Q Plot: Check normality of the residuals
Scale-Location: Check homoscedasticity
Residuals vs Leverage: Identify influential points

Next Section Preview

In the next section, we will learn how to compare distributions across multiple groups.

Continue deepening visualization skills!

6.4 Regression Visualization

Section Objectives

Four-in-One Regression Diagnostic Plots

Coefficient Plot

Prediction Visualization

Section Summary

Key Diagnostic Plots

Next Section Preview

6.4 Regression Visualization

Section Objectives

Four-in-One Regression Diagnostic Plots

Coefficient Plot

Prediction Visualization

Section Summary

Key Diagnostic Plots

Next Section Preview