
4.4 Integrated Workflow: From Data to Publication

"The best analysis is the one that answers the question."— Roger Peng, Biostatistician

Complete Python Statistical Analysis in Practice


Case Objective

Research Question: What is the causal effect of education on wages? (Identification: father's education as an instrumental variable)

Complete Workflow:

  1. Data preparation (pandas)
  2. Descriptive statistics (pandas + scipy.stats)
  3. OLS regression (statsmodels)
  4. Panel data analysis (linearmodels)
  5. Instrumental variable estimation (linearmodels.IV2SLS)
  6. Output publication tables

Complete Code

python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from linearmodels.panel import PanelOLS
from linearmodels.iv import IV2SLS

# Settings
sns.set_style('whitegrid')
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # substitute any font available on your system
np.random.seed(42)

print("=" * 70)
print("Complete Python Statistical Analysis Workflow")
print("=" * 70)

# ========== Step 1: Data Generation ==========
print("\n【Step 1】Data Generation")

n = 2000
ability = np.random.normal(0, 1, n)  # Unobservable to the researcher
father_education = np.random.randint(6, 19, n)

# Education must depend on ability (the source of endogeneity) and on
# father's education (instrument relevance); coefficients are illustrative
education = np.clip(np.round(6 + 0.5 * father_education + 1.5 * ability
                             + np.random.normal(0, 1.5, n)), 8, 20).astype(int)

data = pd.DataFrame({
    'person_id': range(n),
    'education': education,
    'father_education': father_education,
    'experience': np.random.randint(0, 30, n),
    'ability': ability,
    'female': np.random.binomial(1, 0.5, n),
    'urban': np.random.binomial(1, 0.6, n)
})

# Wage generation (includes endogeneity)
data['log_wage'] = (8 +
                    0.08 * data['education'] +
                    0.03 * data['experience'] +
                    0.5 * data['ability'] +  # Ability affects both wage and education
                    -0.15 * data['female'] +
                    0.1 * data['urban'] +
                    np.random.normal(0, 0.3, n))

data['wage'] = np.exp(data['log_wage'])

print(f"Generated {len(data)} observations")

# ========== Step 2: Descriptive Statistics ==========
print("\n【Step 2】Descriptive Statistics")

print("\nCore variables:")
print(data[['wage', 'log_wage', 'education', 'experience']].describe())

print("\nAverage wage by gender:")
gender_stats = data.groupby('female')['wage'].agg(['mean', 'median', 'std'])
gender_stats.index = ['Male', 'Female']
print(gender_stats)

# t-test: Gender wage gap
male_wage = data[data['female'] == 0]['log_wage']
female_wage = data[data['female'] == 1]['log_wage']
t_stat, p_value = stats.ttest_ind(male_wage, female_wage)
print(f"\nGender wage gap t-test: t = {t_stat:.3f}, p = {p_value:.4f}")

# Correlation
print("\nEducation-wage correlation:")
corr, p = stats.pearsonr(data['education'], data['log_wage'])
print(f"Pearson r = {corr:.3f} (p = {p:.4f})")

# ========== Step 3: OLS Regression (Statsmodels) ==========
print("\n【Step 3】OLS Regression")

# Model 1: Simple regression
model1 = smf.ols('log_wage ~ education + experience', data=data).fit()

# Model 2: Add controls
model2 = smf.ols('log_wage ~ education + experience + female + urban',
                 data=data).fit(cov_type='HC3')

# Model 3: Add interaction
model3 = smf.ols('log_wage ~ education + experience + female + urban + education:experience',
                 data=data).fit(cov_type='HC3')

print("\nOLS Regression Comparison:")
results_ols = summary_col(
    [model1, model2, model3],
    model_names=['(1) Basic', '(2) +Controls', '(3) +Interaction'],
    stars=True,
    info_dict={
        'N': lambda x: f"{int(x.nobs):,}",
        'R²': lambda x: f"{x.rsquared:.3f}"
    }
)
print(results_ols)

# Diagnostics
print("\nModel diagnostics:")
from statsmodels.stats.outliers_influence import variance_inflation_factor
edu_idx = model2.model.exog_names.index('education')
print(f"  VIF (education): {variance_inflation_factor(model2.model.exog, edu_idx):.3f}")

from statsmodels.stats.diagnostic import het_breuschpagan
bp_test = het_breuschpagan(model2.resid, model2.model.exog)
print(f"  BP heteroskedasticity test p-value: {bp_test[1]:.4f}")

# ========== Step 4: Panel Data (assuming 3 years) ==========
print("\n【Step 4】Panel Data Analysis")

# Expand to panel data (3 years); education and experience must vary
# within person, or the entity fixed effects absorb them entirely
panel_data = []
for year in [2018, 2019, 2020]:
    df_year = data.copy()
    df_year['year'] = year
    elapsed = year - 2018
    extra_edu = np.random.binomial(elapsed, 0.1, n)  # a few continue schooling
    extra_exp = np.random.binomial(elapsed, 0.9, n)  # most keep working
    df_year['education'] += extra_edu
    df_year['experience'] += extra_exp
    df_year['log_wage'] += 0.08 * extra_edu + 0.03 * extra_exp + 0.02 * elapsed + np.random.normal(0, 0.05, n)
    panel_data.append(df_year)

panel_df = pd.concat(panel_data, ignore_index=True)
panel_df = panel_df.set_index(['person_id', 'year'])

# Fixed effects regression (female and urban are time-invariant,
# so the entity effects absorb them and they must be excluded)
fe_model = PanelOLS(
    dependent=panel_df['log_wage'],
    exog=panel_df[['education', 'experience']],
    entity_effects=True,
    time_effects=True
).fit(cov_type='clustered', cluster_entity=True)

print("\nFixed Effects Regression Results:")
print(fe_model)
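
# Optional benchmark: random effects; a large FE/RE gap would cast doubt on
# the RE assumption that individual effects are uncorrelated with regressors
from linearmodels.panel import RandomEffects
re_model = RandomEffects(panel_df['log_wage'],
                         sm.add_constant(panel_df[['education', 'experience']])).fit()
print("\nRandom effects coefficients (for comparison):")
print(re_model.params)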

# ========== Step 5: Instrumental Variables (2SLS) ==========
print("\n【Step 5】Instrumental Variable Estimation (2SLS)")

# Problem: Education correlates with ability (unobservable) → Endogeneity
# Solution: Use father's education as instrument

iv_model = IV2SLS(
    dependent=data['log_wage'],
    exog=sm.add_constant(data[['experience', 'female', 'urban']]),  # linearmodels requires an explicit constant
    endog=data[['education']],
    instruments=data[['father_education']]
).fit(cov_type='robust')

print("\n2SLS Regression Results:")
print(iv_model)

# First stage F-statistic
print(f"\nFirst stage F-stat: {iv_model.first_stage.diagnostics['f.stat']['education']:.2f}")

# ========== Step 6: Results Comparison Table ==========
print("\n【Step 6】Generate Publication Table")

# Note: statsmodels and linearmodels have different output formats
# Manual coefficient extraction required

results_dict = {
    'OLS': {
        'education': f"{model2.params['education']:.4f} ({model2.bse['education']:.4f})",
        'experience': f"{model2.params['experience']:.4f} ({model2.bse['experience']:.4f})",
        'N': f"{int(model2.nobs):,}",
        'R²': f"{model2.rsquared:.3f}"
    },
    'FE': {
        'education': f"{fe_model.params['education']:.4f} ({fe_model.std_errors['education']:.4f})",
        'experience': f"{fe_model.params['experience']:.4f} ({fe_model.std_errors['experience']:.4f})",
        'N': f"{int(fe_model.nobs):,}",
        'R²': f"{fe_model.rsquared_within:.3f}"
    },
    '2SLS': {
        'education': f"{iv_model.params['education']:.4f} ({iv_model.std_errors['education']:.4f})",
        'experience': f"{iv_model.params['experience']:.4f} ({iv_model.std_errors['experience']:.4f})",
        'N': f"{int(iv_model.nobs):,}",
        'R²': f"-"  # IV models don't have R²
    }
}

results_table = pd.DataFrame(results_dict).T
print("\nRegression Results Comparison:")
print(results_table)

# Save as LaTeX
latex_output = """
\\begin{table}[htbp]
\\centering
\\caption{Causal Effect of Education on Wages}
\\begin{tabular}{lccc}
\\hline\\hline
 & (1) OLS & (2) FE & (3) 2SLS \\\\
\\hline
"""

for var in ['education', 'experience']:
    latex_output += f"{var} & {results_dict['OLS'][var]} & {results_dict['FE'][var]} & {results_dict['2SLS'][var]} \\\\\n"

latex_output += """\\hline
N & {0} & {1} & {2} \\\\
R$^2$ & {3} & {4} & {5} \\\\
\\hline\\hline
\\end{{tabular}}
\\end{{table}}
""".format(
    results_dict['OLS']['N'],
    results_dict['FE']['N'],
    results_dict['2SLS']['N'],
    results_dict['OLS']['R²'],
    results_dict['FE']['R²'],
    results_dict['2SLS']['R²']
)

with open('regression_table.tex', 'w') as f:
    f.write(latex_output)

print("\nLaTeX table saved to regression_table.tex")

# ========== Step 7: Visualization ==========
print("\n【Step 7】Results Visualization")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (1) Education vs Wage
axes[0, 0].scatter(data['education'], data['log_wage'], alpha=0.3, s=10)
axes[0, 0].plot(np.unique(data['education']),
                np.poly1d(np.polyfit(data['education'], data['log_wage'], 1))(np.unique(data['education'])),
                color='red', linewidth=2)
axes[0, 0].set_xlabel('Education (years)')
axes[0, 0].set_ylabel('log(Wage)')
axes[0, 0].set_title('(1) Education vs Wage')

# (2) Coefficient comparison
methods = ['OLS', 'FE', '2SLS']
edu_coefs = [model2.params['education'],
             fe_model.params['education'],
             iv_model.params['education']]
edu_se = [model2.bse['education'],
          fe_model.std_errors['education'],
          iv_model.std_errors['education']]

axes[0, 1].errorbar(methods, edu_coefs, yerr=[1.96*s for s in edu_se],
                    fmt='o', capsize=5, capthick=2, markersize=8)
axes[0, 1].set_ylabel('Education Coefficient')
axes[0, 1].set_title('(2) Coefficient Comparison')
axes[0, 1].grid(alpha=0.3)

# (3) Residual distribution
axes[1, 0].hist(model2.resid, bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Residuals')
axes[1, 0].set_title('(3) OLS Residuals Distribution')

# (4) Wage distribution by education
for edu_level in [12, 16, 20]:
    subset = data[data['education'] == edu_level]['log_wage']
    axes[1, 1].hist(subset, bins=30, alpha=0.5, label=f'Education={edu_level}')

axes[1, 1].set_xlabel('log(Wage)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('(4) Wage Distribution by Education')
axes[1, 1].legend()

plt.tight_layout()
plt.savefig('analysis_results.png', dpi=300, bbox_inches='tight')
print("Visualization saved to analysis_results.png")

# ========== Summary ==========
print("\n" + "=" * 70)
print("Analysis Complete!")
print("=" * 70)
print(f"""
[Key Findings]
1. OLS estimate: Returns to education = {model2.params['education']:.4f} (SE = {model2.bse['education']:.4f})
2. FE estimate: Returns to education = {fe_model.params['education']:.4f} (SE = {fe_model.std_errors['education']:.4f})
3. 2SLS estimate: Returns to education = {iv_model.params['education']:.4f} (SE = {iv_model.std_errors['education']:.4f})

[Interpretation]
- OLS is likely biased upward: unobserved ability raises both education and wages
- FE removes time-invariant confounders, but time-varying unobservables can still bias it
- 2SLS uses father's education as an instrument, yielding a more credible causal estimate

[Output Files]
- regression_table.tex: Publication-quality regression table
- analysis_results.png: Visualization results
""")

Package Usage Summary

Task Mapping

Analysis Task          | Recommended Tool | Code Example
-----------------------|------------------|----------------------------------
Descriptive statistics | pandas + scipy   | df.describe(), stats.ttest_ind()
OLS regression         | statsmodels      | smf.ols().fit()
Model diagnostics      | statsmodels      | het_breuschpagan(), VIF
Panel data             | linearmodels     | PanelOLS().fit()
Instrumental variables | linearmodels     | IV2SLS().fit()
Output tables          | statsmodels      | summary_col(), .as_latex()
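
When every model comes from statsmodels, summary_col can export LaTeX directly, avoiding the manual extraction used in Step 6. A minimal sketch reusing the results_ols object fitted above (the output filename is arbitrary):

python
# Write the statsmodels comparison table straight to a .tex file
with open('ols_table.tex', 'w') as f:
    f.write(results_ols.as_latex())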

Module 4 Complete!

You have now mastered:

  • Statsmodels estimation, diagnostics, and reporting
  • Rapid hypothesis testing with scipy.stats
  • Panel data and instrumental variables with linearmodels
  • A complete workflow from data to publication

Next Module: Time series analysis or advanced econometric methods

