3.7 完整案例：工资决定研究

"In God we trust, all others must bring data.""除了上帝，其他人都必须用数据说话。"— W. Edwards Deming, Statistician & Quality Control Expert (统计学家、质量管理专家)

从原始数据到回归分析的完整工作流

案例目标

研究问题：教育对工资的因果效应是多少？

数据来源：模拟的 NLSY（美国全国青年纵向调查）数据

完整流程：

数据导入与检查
数据清洗
变量构造
数据转换
描述性统计
回归分析
结果可视化与报告

步骤 1：数据导入与检查

python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm

# 设置
sns.set_style('whitegrid')
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
np.random.seed(42)

# 1. 读取数据
print("=" * 70)
print("【步骤 1】数据导入")
print("=" * 70)

df_raw = pd.read_csv('wage_data_raw.csv', encoding='utf-8')
print(f" 成功读取 {len(df_raw)} 行, {df_raw.shape[1]} 列")

# 2. 初步检查
print("\n前 5 行:")
print(df_raw.head())

print("\n数据类型:")
print(df_raw.dtypes)

print("\n基本统计:")
print(df_raw.describe())

# 3. 缺失值
print("\n缺失值:")
missing = df_raw.isnull().sum()
print(missing[missing > 0])

# 4. 重复值
print(f"\n重复行数: {df_raw.duplicated().sum()}")

步骤 2：数据清洗

python

print("\n" + "=" * 70)
print("【步骤 2】数据清洗")
print("=" * 70)

df = df_raw.copy()  # 保留原始数据

# 1. 删除完全重复的行
before = len(df)
df = df.drop_duplicates()
print(f" 删除重复行: {before - len(df)} 行")

# 2. 处理缺失值
# 删除工资或教育缺失的行（关键变量）
before = len(df)
df = df.dropna(subset=['wage', 'education'])
print(f" 删除关键变量缺失: {before - len(df)} 行")

# 其他变量用中位数/众数填充
df['age'].fillna(df['age'].median(), inplace=True)
df['experience'].fillna(df['experience'].median(), inplace=True)

# 3. 处理异常值
# 删除明显错误的值
before = len(df)
df = df[
    (df['age'] >= 18) & (df['age'] <= 65) &
    (df['wage'] > 0) & (df['wage'] < 500000) &
    (df['education'] >= 0) & (df['education'] <= 30) &
    (df['experience'] >= 0) & (df['experience'] <= 50)
]
print(f" 删除异常值: {before - len(df)} 行")

# 4. Winsorize 收入（处理极端值）
from scipy.stats.mstats import winsorize
df['wage'] = winsorize(df['wage'], limits=[0.01, 0.01])
print(" Winsorize wage (1%-99%)")

# 5. 统一字符串格式
df['gender'] = df['gender'].str.lower().str.strip()
df['race'] = df['race'].str.lower().str.strip()
df['region'] = df['region'].str.lower().str.strip()

print(f"\n清洗后: {len(df)} 行, {df.shape[1]} 列")

步骤 3：变量构造

python

print("\n" + "=" * 70)
print("【步骤 3】变量构造")
print("=" * 70)

# 1. 连续变量转换
df['log_wage'] = np.log(df['wage'])
df['experience'] = df['age'] - df['education'] - 6  # 重新计算经验
df['experience_sq'] = df['experience'] ** 2  # 二次项

print(" 创建 log_wage, experience, experience_sq")

# 2. 虚拟变量
df['female'] = (df['gender'] == 'female').astype(int)
df['married'] = (df['marital_status'] == 'married').astype(int)
df['urban'] = (df['area_type'] == 'urban').astype(int)

# 种族虚拟变量
race_dummies = pd.get_dummies(df['race'], prefix='race', drop_first=True)
df = pd.concat([df, race_dummies], axis=1)

# 地区虚拟变量
region_dummies = pd.get_dummies(df['region'], prefix='region', drop_first=True)
df = pd.concat([df, region_dummies], axis=1)

print(" 创建虚拟变量: female, married, urban, race_*, region_*")

# 3. 交互项
df['edu_x_experience'] = df['education'] * df['experience']
df['female_x_married'] = df['female'] * df['married']
df['education_x_urban'] = df['education'] * df['urban']

print(" 创建交互项: edu_x_experience, female_x_married, education_x_urban")

# 4. 分组统计变量
df['region_mean_wage'] = df.groupby('region')['wage'].transform('mean')
df['wage_deviation'] = df['wage'] - df['region_mean_wage']

print(" 创建分组变量: region_mean_wage, wage_deviation")

print(f"\n变量构造后: {df.shape[1]} 列")

步骤 4：描述性统计

python

print("\n" + "=" * 70)
print("【步骤 4】描述性统计")
print("=" * 70)

# 1. 核心变量统计
print("\n核心变量:")
print(df[['wage', 'log_wage', 'education', 'experience', 'age']].describe())

# 2. 分类变量频数
print("\n性别分布:")
print(df['gender'].value_counts())

print("\n种族分布:")
print(df['race'].value_counts())

print("\n地区分布:")
print(df['region'].value_counts())

# 3. 分组统计
print("\n按性别的平均工资:")
print(df.groupby('gender')['wage'].agg(['mean', 'median', 'std']))

print("\n按教育水平的平均工资:")
df['education_group'] = pd.cut(df['education'],
                               bins=[0, 12, 16, 30],
                               labels=['高中以下', '高中-大学', '大学以上'])
print(df.groupby('education_group')['wage'].mean())

# 4. 相关性矩阵
print("\n相关性矩阵:")
corr_vars = ['log_wage', 'education', 'experience', 'age', 'female']
print(df[corr_vars].corr().round(3))

步骤 5：可视化

python

print("\n" + "=" * 70)
print("【步骤 5】数据可视化")
print("=" * 70)

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('工资决定研究：探索性数据分析', fontsize=16, fontweight='bold')

# 子图 1：工资分布
axes[0, 0].hist(df['wage'], bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].set_xlabel('工资（元）')
axes[0, 0].set_ylabel('频数')
axes[0, 0].set_title('(1) 工资分布（右偏）')

# 子图 2：log(工资) 分布
axes[0, 1].hist(df['log_wage'], bins=50, edgecolor='black', alpha=0.7, color='green')
axes[0, 1].set_xlabel('log(工资)')
axes[0, 1].set_ylabel('频数')
axes[0, 1].set_title('(2) log(工资) 分布（接近正态）')

# 子图 3：教育 vs 工资（散点图）
axes[0, 2].scatter(df['education'], df['log_wage'], alpha=0.3, s=10)
z = np.polyfit(df['education'], df['log_wage'], 1)
p = np.poly1d(z)
axes[0, 2].plot(df['education'], p(df['education']), 'r--', linewidth=2)
axes[0, 2].set_xlabel('教育年限')
axes[0, 2].set_ylabel('log(工资)')
axes[0, 2].set_title('(3) 教育 vs log(工资)')

# 子图 4：按性别的工资箱线图
df.boxplot(column='log_wage', by='gender', ax=axes[1, 0])
axes[1, 0].set_xlabel('性别')
axes[1, 0].set_ylabel('log(工资)')
axes[1, 0].set_title('(4) 性别工资差异')
plt.sca(axes[1, 0])
plt.xticks([1, 2], ['Female', 'Male'])

# 子图 5：教育-经验-工资（3D 散点）
from mpl_toolkits.mplot3d import Axes3D
ax_3d = fig.add_subplot(2, 3, 5, projection='3d')
scatter = ax_3d.scatter(df['education'], df['experience'], df['log_wage'],
                        c=df['log_wage'], cmap='viridis', alpha=0.5, s=10)
ax_3d.set_xlabel('教育')
ax_3d.set_ylabel('经验')
ax_3d.set_zlabel('log(工资)')
ax_3d.set_title('(5) 教育-经验-工资关系')

# 子图 6：按教育分组的工资分布
axes[1, 2].violinplot([df[df['education_group'] == g]['log_wage'].dropna()
                        for g in ['高中以下', '高中-大学', '大学以上']],
                       positions=[1, 2, 3])
axes[1, 2].set_xticks([1, 2, 3])
axes[1, 2].set_xticklabels(['高中以下', '高中-大学', '大学以上'], rotation=15)
axes[1, 2].set_ylabel('log(工资)')
axes[1, 2].set_title('(6) 按教育分组的工资分布')

plt.tight_layout()
plt.savefig('wage_eda.png', dpi=300, bbox_inches='tight')
print(" 可视化已保存至 wage_eda.png")

步骤 6：回归分析

python

print("\n" + "=" * 70)
print("【步骤 6】回归分析")
print("=" * 70)

# 模型 1：简单回归（Mincer 方程基础版）
print("\n【模型 1】简单 Mincer 方程")
X1 = sm.add_constant(df[['education', 'experience', 'experience_sq']])
model1 = sm.OLS(df['log_wage'], X1).fit(cov_type='HC3')
print(model1.summary())

print(f"\n教育回报率: {model1.params['education']:.4f}")
print(f"解释：教育每增加 1 年，工资增长 {model1.params['education']*100:.2f}%")

# 模型 2：加入性别和婚姻状况
print("\n【模型 2】加入人口学变量")
X2 = sm.add_constant(df[['education', 'experience', 'experience_sq',
                         'female', 'married']])
model2 = sm.OLS(df['log_wage'], X2).fit(cov_type='HC3')

# 模型 3：加入种族和地区虚拟变量
print("\n【模型 3】加入种族和地区控制")
control_vars = ['education', 'experience', 'experience_sq',
                'female', 'married', 'urban']

# 添加种族虚拟变量
race_cols = [col for col in df.columns if col.startswith('race_')]
control_vars.extend(race_cols)

# 添加地区虚拟变量
region_cols = [col for col in df.columns if col.startswith('region_')]
control_vars.extend(region_cols)

X3 = sm.add_constant(df[control_vars])
model3 = sm.OLS(df['log_wage'], X3).fit(cov_type='HC3')

# 模型 4：加入交互项（教育 × 经验）
print("\n【模型 4】加入交互项")
X4 = sm.add_constant(df[control_vars + ['edu_x_experience']])
model4 = sm.OLS(df['log_wage'], X4).fit(cov_type='HC3')

# 回归结果对比表
from statsmodels.iolib.summary2 import summary_col

print("\n" + "=" * 70)
print("【回归结果对比】")
print("=" * 70)

results_table = summary_col(
    [model1, model2, model3, model4],
    model_names=['(1) 基础', '(2) +人口', '(3) +种族地区', '(4) +交互'],
    stars=True,
    info_dict={'N': lambda x: f"{int(x.nobs):,}",
               'R²': lambda x: f"{x.rsquared:.3f}"}
)

print(results_table)

# 保存回归结果
with open('regression_results.txt', 'w', encoding='utf-8') as f:
    f.write(results_table.as_text())
print("\n 回归结果已保存至 regression_results.txt")

步骤 7：结果解释与报告

python

print("\n" + "=" * 70)
print("【步骤 7】结果解释")
print("=" * 70)

# 提取关键系数
edu_coef = model3.params['education']
edu_se = model3.bse['education']
edu_pval = model3.pvalues['education']

female_coef = model3.params['female']
experience_coef = model3.params['experience']

print("\n【关键发现】")
print(f"1. 教育回报率: {edu_coef:.4f} ({edu_se:.4f})")
print(f"   解释：教育每增加 1 年，工资增长 {edu_coef*100:.2f}%")
print(f"   显著性: p = {edu_pval:.4f} ***")

print(f"\n2. 性别工资差距: {female_coef:.4f}")
print(f"   解释：女性工资比男性低 {abs(female_coef)*100:.2f}%")

print(f"\n3. 经验回报率: {experience_coef:.4f}")
print(f"   解释：经验每增加 1 年，工资增长 {experience_coef*100:.2f}%")

# 计算最优经验年限（二次项顶点）
exp_sq_coef = model3.params['experience_sq']
optimal_exp = -experience_coef / (2 * exp_sq_coef)
print(f"\n4. 最优经验年限: {optimal_exp:.1f} 年")

# 生成预测
print("\n【预测示例】")
# 预测：大学毕业、5 年经验、男性、白人、东部
pred_data = pd.DataFrame({
    'education': [16],
    'experience': [5],
    'experience_sq': [25],
    'female': [0],
    'married': [1],
    'urban': [1]
})

# 添加种族和地区虚拟变量（全为 0，即参照组）
for col in race_cols + region_cols:
    pred_data[col] = 0

pred_data_const = sm.add_constant(pred_data)
log_wage_pred = model3.predict(pred_data_const)[0]
wage_pred = np.exp(log_wage_pred)

print(f"预测工资: {wage_pred:,.2f} 元")

# 可视化预测
fig, ax = plt.subplots(figsize=(10, 6))

# 不同教育水平的工资-经验曲线
exp_range = np.arange(0, 40, 1)
for edu_level in [12, 16, 20]:
    wages = []
    for exp in exp_range:
        pred_temp = pred_data.copy()
        pred_temp['education'] = edu_level
        pred_temp['experience'] = exp
        pred_temp['experience_sq'] = exp ** 2
        pred_temp_const = sm.add_constant(pred_temp)
        log_w = model3.predict(pred_temp_const)[0]
        wages.append(np.exp(log_w))

    ax.plot(exp_range, wages, label=f'教育={edu_level}年', linewidth=2)

ax.set_xlabel('工作经验（年）', fontsize=12)
ax.set_ylabel('预测工资（元）', fontsize=12)
ax.set_title('工资-经验曲线（按教育水平）', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('wage_prediction.png', dpi=300, bbox_inches='tight')
print("\n 预测可视化已保存至 wage_prediction.png")

完整总结

python

print("\n" + "=" * 70)
print("【完整分析总结】")
print("=" * 70)

summary_report = f"""
## 工资决定研究：完整分析报告

### 1. 数据概况
- 原始样本: {len(df_raw)} 行
- 清洗后样本: {len(df)} 行
- 变量数: {df.shape[1]} 个

### 2. 主要发现

#### (1) 教育回报率: {edu_coef*100:.2f}%
- 教育每增加 1 年，工资增长 {edu_coef*100:.2f}%
- 统计显著性: p < 0.001
- 95% 置信区间: [{(edu_coef - 1.96*edu_se)*100:.2f}%, {(edu_coef + 1.96*edu_se)*100:.2f}%]

#### (2) 性别工资差距: {abs(female_coef)*100:.2f}%
- 女性工资比男性低 {abs(female_coef)*100:.2f}%（控制其他因素后）
- 这一差距无法由教育、经验等因素解释

#### (3) 经验回报率: {experience_coef*100:.2f}%
- 经验每增加 1 年，工资增长 {experience_coef*100:.2f}%
- 存在边际递减效应（经验平方项为负）
- 最优经验年限: {optimal_exp:.1f} 年

### 3. 模型拟合度
- R²: {model3.rsquared:.3f}
- 调整 R²: {model3.rsquared_adj:.3f}
- F 统计量: {model3.fvalue:.2f} (p < 0.001)

### 4. 稳健性检验
 使用异方差稳健标准误（HC3）
 Winsorize 处理极端值
 控制种族、地区固定效应

### 5. 政策含义
- 教育投资回报显著且稳健
- 性别工资差距需要政策干预
- 经验积累重要，但存在最优点

---
**分析完成时间**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
**分析工具**: Python + Pandas + Statsmodels
"""

print(summary_report)

# 保存报告
with open('wage_analysis_report.md', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print("\n 完整报告已保存至 wage_analysis_report.md")
print("=" * 70)
print("分析完成！")
print("=" * 70)

小结

完整工作流回顾

步骤	任务	核心函数
1	数据导入	`pd.read_csv()`
2	数据清洗	`dropna()`, `winsorize()`
3	变量构造	`get_dummies()`, `transform()`
4	描述统计	`describe()`, `groupby()`
5	可视化	`matplotlib`, `seaborn`
6	回归分析	`sm.OLS()`
7	结果解释	系数解释、预测

关键要点

保留原始数据：使用 df.copy()
每步验证：检查数据形状、统计量
可重复性：设置随机种子、保存代码
稳健性：异方差稳健标准误、Winsorize
可视化：图表比数字更直观

练习

使用你自己的数据集，完成从数据导入到回归分析的完整流程。

Module 3 完成！

下一步：Module 4 - 假设检验与统计推断

参考资源：

Angrist & Pischke (2009): Mostly Harmless Econometrics
Wooldridge (2020): Introductory Econometrics (7th Ed)

3.7 完整案例：工资决定研究 ​

案例目标 ​

步骤 1：数据导入与检查 ​

步骤 2：数据清洗 ​

步骤 3：变量构造 ​

步骤 4：描述性统计 ​

步骤 5：可视化 ​

步骤 6：回归分析 ​

步骤 7：结果解释与报告 ​

完整总结 ​

小结 ​

完整工作流回顾 ​

关键要点 ​

练习 ​

3.7 完整案例：工资决定研究

案例目标

步骤 1：数据导入与检查

步骤 2：数据清洗

步骤 3：变量构造

步骤 4：描述性统计

步骤 5：可视化

步骤 6：回归分析

步骤 7：结果解释与报告

完整总结

小结

完整工作流回顾

关键要点

练习