4.4 综合工作流：从数据到论文

"The best analysis is the one that answers the question.""最好的分析是能回答问题的分析。"— Roger Peng, Biostatistician (生物统计学家)

Python 统计分析完整实战

案例目标

研究问题：教育对工资的因果效应（使用父亲教育作为工具变量）

完整流程：

数据准备（pandas）
描述统计（pandas + scipy.stats）
OLS 回归（statsmodels）
面板数据分析（linearmodels）
工具变量估计（linearmodels.IV2SLS）
输出论文表格

完整代码

python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from linearmodels.panel import PanelOLS
from linearmodels.iv import IV2SLS

# 设置
sns.set_style('whitegrid')
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
np.random.seed(42)

print("=" * 70)
print("Python 统计分析完整工作流")
print("=" * 70)

# ========== 步骤 1：数据生成 ==========
print("\n【步骤 1】数据生成")

n = 2000
data = pd.DataFrame({
    'person_id': range(n),
    'education': np.random.randint(8, 21, n),
    'father_education': np.random.randint(6, 19, n),
    'experience': np.random.randint(0, 30, n),
    'ability': np.random.normal(0, 1, n),  # 不可观测
    'female': np.random.binomial(1, 0.5, n),
    'urban': np.random.binomial(1, 0.6, n)
})

# 工资生成（包含内生性）
data['log_wage'] = (8 + 
                    0.08 * data['education'] + 
                    0.03 * data['experience'] +
                    0.5 * data['ability'] +  # 能力影响工资和教育
                    -0.15 * data['female'] +
                    0.1 * data['urban'] +
                    np.random.normal(0, 0.3, n))

data['wage'] = np.exp(data['log_wage'])

print(f" 生成 {len(data)} 个观测")

# ========== 步骤 2：描述统计 ==========
print("\n【步骤 2】描述统计")

print("\n核心变量:")
print(data[['wage', 'log_wage', 'education', 'experience']].describe())

print("\n按性别的平均工资:")
gender_stats = data.groupby('female')['wage'].agg(['mean', 'median', 'std'])
gender_stats.index = ['Male', 'Female']
print(gender_stats)

# t 检验：性别工资差异
male_wage = data[data['female'] == 0]['log_wage']
female_wage = data[data['female'] == 1]['log_wage']
t_stat, p_value = stats.ttest_ind(male_wage, female_wage)
print(f"\n性别工资差异 t 检验: t = {t_stat:.3f}, p = {p_value:.4f}")

# 相关性
print("\n教育与工资的相关性:")
corr, p = stats.pearsonr(data['education'], data['log_wage'])
print(f"Pearson r = {corr:.3f} (p = {p:.4f})")

# ========== 步骤 3：OLS 回归（Statsmodels）==========
print("\n【步骤 3】OLS 回归")

# 模型 1：简单回归
model1 = smf.ols('log_wage ~ education + experience', data=data).fit()

# 模型 2：加入控制变量
model2 = smf.ols('log_wage ~ education + experience + female + urban', 
                 data=data).fit(cov_type='HC3')

# 模型 3：加入交互项
model3 = smf.ols('log_wage ~ education + experience + female + urban + education:experience', 
                 data=data).fit(cov_type='HC3')

print("\nOLS 回归结果对比:")
results_ols = summary_col(
    [model1, model2, model3],
    model_names=['(1) Basic', '(2) +Controls', '(3) +Interaction'],
    stars=True,
    info_dict={
        'N': lambda x: f"{int(x.nobs):,}",
        'R²': lambda x: f"{x.rsquared:.3f}"
    }
)
print(results_ols)

# 诊断
print("\n模型诊断:")
print(f"  VIF (education): {model2.get_influence().summary_frame()['hat'].mean():.3f}")

from statsmodels.stats.diagnostic import het_breuschpagan
bp_test = het_breuschpagan(model2.resid, model2.model.exog)
print(f"  BP 异方差检验 p-value: {bp_test[1]:.4f}")

# ========== 步骤 4：面板数据（假设有 3 年数据）==========
print("\n【步骤 4】面板数据分析")

# 扩展为面板数据（3 年）
panel_data = []
for year in [2018, 2019, 2020]:
    df_year = data.copy()
    df_year['year'] = year
    df_year['log_wage'] = df_year['log_wage'] + 0.02 * (year - 2018) + np.random.normal(0, 0.05, len(df_year))
    panel_data.append(df_year)

panel_df = pd.concat(panel_data, ignore_index=True)
panel_df = panel_df.set_index(['person_id', 'year'])

# 固定效应回归
fe_model = PanelOLS(
    dependent=panel_df['log_wage'],
    exog=panel_df[['education', 'experience', 'female', 'urban']],
    entity_effects=True,
    time_effects=True
).fit(cov_type='clustered', cluster_entity=True)

print("\n固定效应回归结果:")
print(fe_model)

# ========== 步骤 5：工具变量（2SLS）==========
print("\n【步骤 5】工具变量估计（2SLS）")

# 问题：教育与能力（不可观测）相关 → 内生性
# 解决：使用父亲教育作为工具变量

iv_model = IV2SLS(
    dependent=data['log_wage'],
    exog=data[['experience', 'female', 'urban']],
    endog=data[['education']],
    instruments=data[['father_education']]
).fit(cov_type='robust')

print("\n2SLS 回归结果:")
print(iv_model)

# 第一阶段 F 统计量
print(f"\n第一阶段 F-stat: {iv_model.first_stage.diagnostics['f.stat']['education']:.2f}")

# ========== 步骤 6：结果对比表格 ==========
print("\n【步骤 6】生成论文表格")

# 注意：statsmodels 和 linearmodels 的输出格式不同
# 需要手动提取系数

results_dict = {
    'OLS': {
        'education': f"{model2.params['education']:.4f} ({model2.bse['education']:.4f})",
        'experience': f"{model2.params['experience']:.4f} ({model2.bse['experience']:.4f})",
        'N': f"{int(model2.nobs):,}",
        'R²': f"{model2.rsquared:.3f}"
    },
    'FE': {
        'education': f"{fe_model.params['education']:.4f} ({fe_model.std_errors['education']:.4f})",
        'experience': f"{fe_model.params['experience']:.4f} ({fe_model.std_errors['experience']:.4f})",
        'N': f"{int(fe_model.nobs):,}",
        'R²': f"{fe_model.rsquared_within:.3f}"
    },
    '2SLS': {
        'education': f"{iv_model.params['education']:.4f} ({iv_model.std_errors['education']:.4f})",
        'experience': f"{iv_model.params['experience']:.4f} ({iv_model.std_errors['experience']:.4f})",
        'N': f"{int(iv_model.nobs):,}",
        'R²': f"-"  # IV 模型无 R²
    }
}

results_table = pd.DataFrame(results_dict).T
print("\n回归结果对比:")
print(results_table)

# 保存为 LaTeX
latex_output = """
\\begin{table}[htbp]
\\centering
\\caption{教育对工资的因果效应}
\\begin{tabular}{lccc}
\\hline\\hline
 & (1) OLS & (2) FE & (3) 2SLS \\\\
\\hline
"""

for var in ['education', 'experience']:
    latex_output += f"{var} & {results_dict['OLS'][var]} & {results_dict['FE'][var]} & {results_dict['2SLS'][var]} \\\\\n"

latex_output += """\\hline
N & {0} & {1} & {2} \\\\
R² & {3} & {4} & {5} \\\\
\\hline\\hline
\\end{{tabular}}
\\end{{table}}
""".format(
    results_dict['OLS']['N'],
    results_dict['FE']['N'],
    results_dict['2SLS']['N'],
    results_dict['OLS']['R²'],
    results_dict['FE']['R²'],
    results_dict['2SLS']['R²']
)

with open('regression_table.tex', 'w') as f:
    f.write(latex_output)

print("\n LaTeX 表格已保存至 regression_table.tex")

# ========== 步骤 7：可视化 ==========
print("\n【步骤 7】结果可视化")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (1) 教育 vs 工资
axes[0, 0].scatter(data['education'], data['log_wage'], alpha=0.3, s=10)
axes[0, 0].plot(np.unique(data['education']),
                np.poly1d(np.polyfit(data['education'], data['log_wage'], 1))(np.unique(data['education'])),
                color='red', linewidth=2)
axes[0, 0].set_xlabel('Education (years)')
axes[0, 0].set_ylabel('log(Wage)')
axes[0, 0].set_title('(1) Education vs Wage')

# (2) 回归系数对比
methods = ['OLS', 'FE', '2SLS']
edu_coefs = [model2.params['education'], 
             fe_model.params['education'],
             iv_model.params['education']]
edu_se = [model2.bse['education'],
          fe_model.std_errors['education'],
          iv_model.std_errors['education']]

axes[0, 1].errorbar(methods, edu_coefs, yerr=[1.96*s for s in edu_se],
                    fmt='o', capsize=5, capthick=2, markersize=8)
axes[0, 1].set_ylabel('Education Coefficient')
axes[0, 1].set_title('(2) Coefficient Comparison')
axes[0, 1].grid(alpha=0.3)

# (3) 残差分布
axes[1, 0].hist(model2.resid, bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Residuals')
axes[1, 0].set_title('(3) OLS Residuals Distribution')

# (4) 工资分布（按教育分组）
for edu_level in [12, 16, 20]:
    subset = data[data['education'] == edu_level]['log_wage']
    axes[1, 1].hist(subset, bins=30, alpha=0.5, label=f'Education={edu_level}')

axes[1, 1].set_xlabel('log(Wage)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('(4) Wage Distribution by Education')
axes[1, 1].legend()

plt.tight_layout()
plt.savefig('analysis_results.png', dpi=300, bbox_inches='tight')
print(" 可视化已保存至 analysis_results.png")

# ========== 总结 ==========
print("\n" + "=" * 70)
print("分析完成！")
print("=" * 70)
print(f"""
【主要发现】
1. OLS 估计：教育回报率 = {model2.params['education']:.4f} (SE = {model2.bse['education']:.4f})
2. FE 估计：教育回报率 = {fe_model.params['education']:.4f} (SE = {fe_model.std_errors['education']:.4f})
3. 2SLS 估计：教育回报率 = {iv_model.params['education']:.4f} (SE = {iv_model.std_errors['education']:.4f})

【解释】
- OLS 可能有偏（内生性：能力同时影响教育和工资）
- FE 控制个体固定效应，但可能仍有偏
- 2SLS 使用父亲教育作为工具变量，提供更可信的因果估计

【文件输出】
- regression_table.tex：论文级回归表格
- analysis_results.png：可视化结果
""")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279

工具包使用总结

任务映射

分析任务	推荐工具	代码示例
描述统计	pandas + scipy	`df.describe()`, `stats.ttest_ind()`
OLS 回归	statsmodels	`smf.ols().fit()`
模型诊断	statsmodels	`het_breuschpagan()`, VIF
面板数据	linearmodels	`PanelOLS().fit()`
工具变量	linearmodels	`IV2SLS().fit()`
输出表格	statsmodels	`summary_col()`, `.as_latex()`

Module 4 完成！

你已经掌握：

Statsmodels 完整功能
SciPy.stats 快速检验
LinearModels 面板数据与 IV
从数据到论文的完整工作流

下一个 Module：时间序列分析或高级计量方法

参考资源：

QuantEcon: https://quantecon.org/
Python for Econometrics: Kevin Sheppard

4.4 综合工作流：从数据到论文 ​

案例目标 ​

完整代码 ​

工具包使用总结 ​

任务映射 ​