4.4 综合工作流:从数据到论文
"The best analysis is the one that answers the question.""最好的分析是能回答问题的分析。"— Roger Peng, Biostatistician (生物统计学家)
Python 统计分析完整实战
案例目标
研究问题:教育对工资的因果效应(使用父亲教育作为工具变量)
完整流程:
- 数据准备(pandas)
- 描述统计(pandas + scipy.stats)
- OLS 回归(statsmodels)
- 面板数据分析(linearmodels)
- 工具变量估计(linearmodels.IV2SLS)
- 输出论文表格
完整代码
python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from linearmodels.panel import PanelOLS
from linearmodels.iv import IV2SLS
# 设置
sns.set_style('whitegrid')
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
np.random.seed(42)
print("=" * 70)
print("Python 统计分析完整工作流")
print("=" * 70)
# ========== 步骤 1:数据生成 ==========
print("\n【步骤 1】数据生成")
n = 2000
data = pd.DataFrame({
'person_id': range(n),
'education': np.random.randint(8, 21, n),
'father_education': np.random.randint(6, 19, n),
'experience': np.random.randint(0, 30, n),
'ability': np.random.normal(0, 1, n), # 不可观测
'female': np.random.binomial(1, 0.5, n),
'urban': np.random.binomial(1, 0.6, n)
})
# 工资生成(包含内生性)
data['log_wage'] = (8 +
0.08 * data['education'] +
0.03 * data['experience'] +
0.5 * data['ability'] + # 能力影响工资和教育
-0.15 * data['female'] +
0.1 * data['urban'] +
np.random.normal(0, 0.3, n))
data['wage'] = np.exp(data['log_wage'])
print(f" 生成 {len(data)} 个观测")
# ========== 步骤 2:描述统计 ==========
print("\n【步骤 2】描述统计")
print("\n核心变量:")
print(data[['wage', 'log_wage', 'education', 'experience']].describe())
print("\n按性别的平均工资:")
gender_stats = data.groupby('female')['wage'].agg(['mean', 'median', 'std'])
gender_stats.index = ['Male', 'Female']
print(gender_stats)
# t 检验:性别工资差异
male_wage = data[data['female'] == 0]['log_wage']
female_wage = data[data['female'] == 1]['log_wage']
t_stat, p_value = stats.ttest_ind(male_wage, female_wage)
print(f"\n性别工资差异 t 检验: t = {t_stat:.3f}, p = {p_value:.4f}")
# 相关性
print("\n教育与工资的相关性:")
corr, p = stats.pearsonr(data['education'], data['log_wage'])
print(f"Pearson r = {corr:.3f} (p = {p:.4f})")
# ========== 步骤 3:OLS 回归(Statsmodels)==========
print("\n【步骤 3】OLS 回归")
# 模型 1:简单回归
model1 = smf.ols('log_wage ~ education + experience', data=data).fit()
# 模型 2:加入控制变量
model2 = smf.ols('log_wage ~ education + experience + female + urban',
data=data).fit(cov_type='HC3')
# 模型 3:加入交互项
model3 = smf.ols('log_wage ~ education + experience + female + urban + education:experience',
data=data).fit(cov_type='HC3')
print("\nOLS 回归结果对比:")
results_ols = summary_col(
[model1, model2, model3],
model_names=['(1) Basic', '(2) +Controls', '(3) +Interaction'],
stars=True,
info_dict={
'N': lambda x: f"{int(x.nobs):,}",
'R²': lambda x: f"{x.rsquared:.3f}"
}
)
print(results_ols)
# 诊断
print("\n模型诊断:")
print(f" VIF (education): {model2.get_influence().summary_frame()['hat'].mean():.3f}")
from statsmodels.stats.diagnostic import het_breuschpagan
bp_test = het_breuschpagan(model2.resid, model2.model.exog)
print(f" BP 异方差检验 p-value: {bp_test[1]:.4f}")
# ========== 步骤 4:面板数据(假设有 3 年数据)==========
print("\n【步骤 4】面板数据分析")
# 扩展为面板数据(3 年)
panel_data = []
for year in [2018, 2019, 2020]:
df_year = data.copy()
df_year['year'] = year
df_year['log_wage'] = df_year['log_wage'] + 0.02 * (year - 2018) + np.random.normal(0, 0.05, len(df_year))
panel_data.append(df_year)
panel_df = pd.concat(panel_data, ignore_index=True)
panel_df = panel_df.set_index(['person_id', 'year'])
# 固定效应回归
fe_model = PanelOLS(
dependent=panel_df['log_wage'],
exog=panel_df[['education', 'experience', 'female', 'urban']],
entity_effects=True,
time_effects=True
).fit(cov_type='clustered', cluster_entity=True)
print("\n固定效应回归结果:")
print(fe_model)
# ========== 步骤 5:工具变量(2SLS)==========
print("\n【步骤 5】工具变量估计(2SLS)")
# 问题:教育与能力(不可观测)相关 → 内生性
# 解决:使用父亲教育作为工具变量
iv_model = IV2SLS(
dependent=data['log_wage'],
exog=data[['experience', 'female', 'urban']],
endog=data[['education']],
instruments=data[['father_education']]
).fit(cov_type='robust')
print("\n2SLS 回归结果:")
print(iv_model)
# 第一阶段 F 统计量
print(f"\n第一阶段 F-stat: {iv_model.first_stage.diagnostics['f.stat']['education']:.2f}")
# ========== 步骤 6:结果对比表格 ==========
print("\n【步骤 6】生成论文表格")
# 注意:statsmodels 和 linearmodels 的输出格式不同
# 需要手动提取系数
results_dict = {
'OLS': {
'education': f"{model2.params['education']:.4f} ({model2.bse['education']:.4f})",
'experience': f"{model2.params['experience']:.4f} ({model2.bse['experience']:.4f})",
'N': f"{int(model2.nobs):,}",
'R²': f"{model2.rsquared:.3f}"
},
'FE': {
'education': f"{fe_model.params['education']:.4f} ({fe_model.std_errors['education']:.4f})",
'experience': f"{fe_model.params['experience']:.4f} ({fe_model.std_errors['experience']:.4f})",
'N': f"{int(fe_model.nobs):,}",
'R²': f"{fe_model.rsquared_within:.3f}"
},
'2SLS': {
'education': f"{iv_model.params['education']:.4f} ({iv_model.std_errors['education']:.4f})",
'experience': f"{iv_model.params['experience']:.4f} ({iv_model.std_errors['experience']:.4f})",
'N': f"{int(iv_model.nobs):,}",
'R²': f"-" # IV 模型无 R²
}
}
results_table = pd.DataFrame(results_dict).T
print("\n回归结果对比:")
print(results_table)
# 保存为 LaTeX
latex_output = """
\\begin{table}[htbp]
\\centering
\\caption{教育对工资的因果效应}
\\begin{tabular}{lccc}
\\hline\\hline
& (1) OLS & (2) FE & (3) 2SLS \\\\
\\hline
"""
for var in ['education', 'experience']:
latex_output += f"{var} & {results_dict['OLS'][var]} & {results_dict['FE'][var]} & {results_dict['2SLS'][var]} \\\\\n"
latex_output += """\\hline
N & {0} & {1} & {2} \\\\
R² & {3} & {4} & {5} \\\\
\\hline\\hline
\\end{{tabular}}
\\end{{table}}
""".format(
results_dict['OLS']['N'],
results_dict['FE']['N'],
results_dict['2SLS']['N'],
results_dict['OLS']['R²'],
results_dict['FE']['R²'],
results_dict['2SLS']['R²']
)
with open('regression_table.tex', 'w') as f:
f.write(latex_output)
print("\n LaTeX 表格已保存至 regression_table.tex")
# ========== 步骤 7:可视化 ==========
print("\n【步骤 7】结果可视化")
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# (1) 教育 vs 工资
axes[0, 0].scatter(data['education'], data['log_wage'], alpha=0.3, s=10)
axes[0, 0].plot(np.unique(data['education']),
np.poly1d(np.polyfit(data['education'], data['log_wage'], 1))(np.unique(data['education'])),
color='red', linewidth=2)
axes[0, 0].set_xlabel('Education (years)')
axes[0, 0].set_ylabel('log(Wage)')
axes[0, 0].set_title('(1) Education vs Wage')
# (2) 回归系数对比
methods = ['OLS', 'FE', '2SLS']
edu_coefs = [model2.params['education'],
fe_model.params['education'],
iv_model.params['education']]
edu_se = [model2.bse['education'],
fe_model.std_errors['education'],
iv_model.std_errors['education']]
axes[0, 1].errorbar(methods, edu_coefs, yerr=[1.96*s for s in edu_se],
fmt='o', capsize=5, capthick=2, markersize=8)
axes[0, 1].set_ylabel('Education Coefficient')
axes[0, 1].set_title('(2) Coefficient Comparison')
axes[0, 1].grid(alpha=0.3)
# (3) 残差分布
axes[1, 0].hist(model2.resid, bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Residuals')
axes[1, 0].set_title('(3) OLS Residuals Distribution')
# (4) 工资分布(按教育分组)
for edu_level in [12, 16, 20]:
subset = data[data['education'] == edu_level]['log_wage']
axes[1, 1].hist(subset, bins=30, alpha=0.5, label=f'Education={edu_level}')
axes[1, 1].set_xlabel('log(Wage)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('(4) Wage Distribution by Education')
axes[1, 1].legend()
plt.tight_layout()
plt.savefig('analysis_results.png', dpi=300, bbox_inches='tight')
print(" 可视化已保存至 analysis_results.png")
# ========== 总结 ==========
print("\n" + "=" * 70)
print("分析完成!")
print("=" * 70)
print(f"""
【主要发现】
1. OLS 估计:教育回报率 = {model2.params['education']:.4f} (SE = {model2.bse['education']:.4f})
2. FE 估计:教育回报率 = {fe_model.params['education']:.4f} (SE = {fe_model.std_errors['education']:.4f})
3. 2SLS 估计:教育回报率 = {iv_model.params['education']:.4f} (SE = {iv_model.std_errors['education']:.4f})
【解释】
- OLS 可能有偏(内生性:能力同时影响教育和工资)
- FE 控制个体固定效应,但可能仍有偏
- 2SLS 使用父亲教育作为工具变量,提供更可信的因果估计
【文件输出】
- regression_table.tex:论文级回归表格
- analysis_results.png:可视化结果
""")工具包使用总结
任务映射
| 分析任务 | 推荐工具 | 代码示例 |
|---|---|---|
| 描述统计 | pandas + scipy | df.describe(), stats.ttest_ind() |
| OLS 回归 | statsmodels | smf.ols().fit() |
| 模型诊断 | statsmodels | het_breuschpagan(), VIF |
| 面板数据 | linearmodels | PanelOLS().fit() |
| 工具变量 | linearmodels | IV2SLS().fit() |
| 输出表格 | statsmodels | summary_col(), .as_latex() |
Module 4 完成!
你已经掌握:
- Statsmodels 完整功能
- SciPy.stats 快速检验
- LinearModels 面板数据与 IV
- 从数据到论文的完整工作流
下一个 Module:时间序列分析或高级计量方法
参考资源:
- QuantEcon: https://quantecon.org/
- Python for Econometrics: Kevin Sheppard