Skip to content

12.4 Python实现与经典案例

"Causal forests enable us to move from average effects to personalized predictions.""因果森林使我们能够从平均效应转向个性化预测。"— Stefan Wager, Stanford Statistician (斯坦福统计学家)

从理论到实践:三个经典案例的完整实现

难度实践导向


本节目标

完成本节后,你将能够:

  • 使用econml和causalml库实现Causal Forest
  • 复现经典401(k)养老金计划案例
  • 分析LaLonde就业培训数据
  • 实现个性化医疗决策
  • 可视化和解释异质性处理效应
  • 掌握完整的因果推断工作流

环境配置

安装依赖

bash
# 核心因果推断库
pip install econml causalml

# 数据处理
pip install pandas numpy scipy

# 可视化
pip install matplotlib seaborn plotly

# 机器学习
pip install scikit-learn xgboost lightgbm

# 统计工具
pip install statsmodels

导入库

python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Causal ML libraries
from econml.grf import CausalForest as EconMLCausalForest
from econml.dml import CausalForestDML
from econml.cate_interpreter import SingleTreeCateInterpreter
from causalml.inference.tree import UpliftRandomForestClassifier

# Traditional ML
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# 设置中文字体(Mac)
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
sns.set_style("whitegrid")

print("=" * 70)
print("环境配置完成!")
print("=" * 70)

案例1:401(k)养老金计划

背景

研究问题:雇主提供的401(k)计划对员工储蓄的影响是否具有异质性?

数据来源:Chernozhukov et al. (2018) 使用的经典数据集

变量

  • 处理:是否参与401(k)计划 (e401)
  • 结果:净资产 (net_tfa)
  • 协变量:年龄、收入、教育、婚姻状况等

数据准备

python
# 加载数据(从econml示例数据)
from econml.solutions.causal_analysis import CausalAnalysis

# 模拟401(k)数据(实际应用中从真实数据加载)
np.random.seed(42)
n = 9915  # 接近真实样本量

# 生成协变量
age = np.random.uniform(25, 64, n)
income = np.random.lognormal(10.5, 0.8, n)  # 收入
education = np.random.choice([12, 14, 16, 18], n, p=[0.3, 0.3, 0.3, 0.1])
marital_status = np.random.binomial(1, 0.6, n)  # 1=已婚
family_size = np.random.poisson(2, n) + 1

# 处理分配(有选择性偏差)
# 高收入、高学历者更可能参与401(k)
propensity_score = 1 / (1 + np.exp(-(
    -2 +
    0.02 * (age - 45) +
    0.3 * np.log(income / 50000) +
    0.2 * (education - 12) / 4 +
    0.3 * marital_status
)))

e401 = np.random.binomial(1, propensity_score)

# 真实的异质性处理效应
# 低收入者受益最多(边际效用递减)
true_tau = (
    8000 +  # 基础效应
    10000 * np.exp(-income / 50000) +  # 低收入者受益多
    500 * (education - 12) +  # 教育正效应
    -100 * (age - 45) +  # 年龄负效应
    2000 * marital_status  # 已婚者受益多
)

# 结果变量
# 净资产 = 基础资产 + 401(k)效应
base_wealth = (
    10000 +
    1000 * income / 10000 +
    2000 * education +
    500 * age +
    5000 * marital_status +
    np.random.normal(0, 10000, n)
)

net_tfa = base_wealth + e401 * true_tau

# 创建DataFrame
df_401k = pd.DataFrame({
    'age': age,
    'income': income,
    'education': education,
    'marital_status': marital_status,
    'family_size': family_size,
    'e401': e401,
    'net_tfa': net_tfa,
    'true_tau': true_tau,
    'propensity_score': propensity_score
})

print("=" * 70)
print("401(k)数据集")
print("=" * 70)
print(f"样本量: {len(df_401k)}")
print(f"参与率: {df_401k['e401'].mean():.1%}")
print(f"\n描述统计:")
print(df_401k[['age', 'income', 'education', 'net_tfa']].describe())

探索性分析

python
# 1. 参与者 vs 非参与者对比
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 年龄分布
axes[0, 0].hist(df_401k[df_401k['e401']==1]['age'], bins=30, alpha=0.5,
                label='参与者', density=True, color='green')
axes[0, 0].hist(df_401k[df_401k['e401']==0]['age'], bins=30, alpha=0.5,
                label='非参与者', density=True, color='red')
axes[0, 0].set_xlabel('年龄', fontsize=11)
axes[0, 0].set_ylabel('密度', fontsize=11)
axes[0, 0].set_title('年龄分布', fontsize=12, fontweight='bold')
axes[0, 0].legend()

# 收入分布(对数)
axes[0, 1].hist(np.log(df_401k[df_401k['e401']==1]['income']), bins=30,
                alpha=0.5, label='参与者', density=True, color='green')
axes[0, 1].hist(np.log(df_401k[df_401k['e401']==0]['income']), bins=30,
                alpha=0.5, label='非参与者', density=True, color='red')
axes[0, 1].set_xlabel('log(收入)', fontsize=11)
axes[0, 1].set_ylabel('密度', fontsize=11)
axes[0, 1].set_title('收入分布', fontsize=12, fontweight='bold')
axes[0, 1].legend()

# 教育分布
edu_counts = df_401k.groupby(['education', 'e401']).size().unstack(fill_value=0)
edu_counts_pct = edu_counts.div(edu_counts.sum(axis=0), axis=1) * 100
edu_counts_pct.plot(kind='bar', ax=axes[1, 0], color=['red', 'green'], alpha=0.7)
axes[1, 0].set_xlabel('教育年限', fontsize=11)
axes[1, 0].set_ylabel('占比 (%)', fontsize=11)
axes[1, 0].set_title('教育分布', fontsize=12, fontweight='bold')
axes[1, 0].legend(['非参与者', '参与者'])
axes[1, 0].set_xticklabels(axes[1, 0].get_xticklabels(), rotation=0)

# 净资产分布
axes[1, 1].hist(df_401k[df_401k['e401']==1]['net_tfa'], bins=50, alpha=0.5,
                label='参与者', density=True, color='green', range=(0, 100000))
axes[1, 1].hist(df_401k[df_401k['e401']==0]['net_tfa'], bins=50, alpha=0.5,
                label='非参与者', density=True, color='red', range=(0, 100000))
axes[1, 1].set_xlabel('净资产 ($)', fontsize=11)
axes[1, 1].set_ylabel('密度', fontsize=11)
axes[1, 1].set_title('净资产分布', fontsize=12, fontweight='bold')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

# 2. 简单对比(有偏估计)
ate_naive = df_401k[df_401k['e401']==1]['net_tfa'].mean() - \
            df_401k[df_401k['e401']==0]['net_tfa'].mean()

print("\n" + "=" * 70)
print("简单对比(Naive ATE)")
print("=" * 70)
print(f"参与者平均净资产: ${df_401k[df_401k['e401']==1]['net_tfa'].mean():,.0f}")
print(f"非参与者平均净资产: ${df_401k[df_401k['e401']==0]['net_tfa'].mean():,.0f}")
print(f"差异(Naive ATE): ${ate_naive:,.0f}")
print(f"真实ATE: ${df_401k['true_tau'].mean():,.0f}")
print(f"\n注意:Naive估计有偏差(选择性偏差)!")

使用Causal Forest估计CATE

python
# 准备数据
X = df_401k[['age', 'income', 'education', 'marital_status', 'family_size']].values
T = df_401k['e401'].values.reshape(-1, 1)
Y = df_401k['net_tfa'].values

# 划分训练集和测试集
X_train, X_test, T_train, T_test, Y_train, Y_test = train_test_split(
    X, T, Y, test_size=0.3, random_state=42
)

print("=" * 70)
print("训练Causal Forest")
print("=" * 70)

# 方法1:econml的Causal Forest
cf_econml = EconMLCausalForest(
    n_estimators=3000,
    min_samples_leaf=20,
    max_depth=None,
    max_features='sqrt',
    random_state=42,
    verbose=0
)

cf_econml.fit(X_train, T_train, Y_train)

# 预测CATE
tau_train = cf_econml.predict(X_train)
tau_test = cf_econml.predict(X_test)
tau_train_interval = cf_econml.predict_interval(X_train, alpha=0.05)
tau_test_interval = cf_econml.predict_interval(X_test, alpha=0.05)

print(f"训练集样本量: {len(X_train)}")
print(f"测试集样本量: {len(X_test)}")

# 评估
true_tau_train = df_401k.loc[X_train[:, 0] == df_401k['age'].values, 'true_tau'].values[:len(tau_train)]
true_tau_test = df_401k.loc[X_test[:, 0] == df_401k['age'].values, 'true_tau'].values[:len(tau_test)]

# 简化评估(使用整体数据)
df_401k['tau_pred'] = cf_econml.predict(X)

mse = mean_squared_error(df_401k['true_tau'], df_401k['tau_pred'])
r2 = r2_score(df_401k['true_tau'], df_401k['tau_pred'])

print(f"\nCATE预测性能:")
print(f"  MSE: {mse:,.0f}")
print(f"  R²: {r2:.3f}")
print(f"\n估计的ATE: ${df_401k['tau_pred'].mean():,.0f}")
print(f"真实的ATE: ${df_401k['true_tau'].mean():,.0f}")

异质性分析

python
# 1. 按收入分组
df_401k['income_quartile'] = pd.qcut(df_401k['income'], q=4,
                                     labels=['Q1(最低)', 'Q2', 'Q3', 'Q4(最高)'])

income_effects = df_401k.groupby('income_quartile').agg({
    'true_tau': 'mean',
    'tau_pred': 'mean',
    'income': 'mean'
}).round(0)

print("\n" + "=" * 70)
print("按收入四分位数的处理效应")
print("=" * 70)
print(income_effects)

# 2. 按年龄分组
df_401k['age_group'] = pd.cut(df_401k['age'], bins=[25, 35, 45, 55, 64],
                              labels=['25-35', '36-45', '46-55', '56-64'])

age_effects = df_401k.groupby('age_group').agg({
    'true_tau': 'mean',
    'tau_pred': 'mean',
    'age': 'mean'
}).round(0)

print("\n" + "=" * 70)
print("按年龄组的处理效应")
print("=" * 70)
print(age_effects)

# 3. 可视化异质性
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (a) 真实 vs 预测
axes[0, 0].scatter(df_401k['true_tau'], df_401k['tau_pred'], alpha=0.2, s=10)
axes[0, 0].plot([df_401k['true_tau'].min(), df_401k['true_tau'].max()],
                [df_401k['true_tau'].min(), df_401k['true_tau'].max()],
                'r--', linewidth=2)
axes[0, 0].set_xlabel('真实CATE ($)', fontsize=11)
axes[0, 0].set_ylabel('预测CATE ($)', fontsize=11)
axes[0, 0].set_title(f'CATE预测准确性 (R² = {r2:.3f})', fontsize=12, fontweight='bold')
axes[0, 0].grid(True, alpha=0.3)

# (b) 按收入的CATE
axes[0, 1].scatter(df_401k['income'], df_401k['tau_pred'], alpha=0.2, s=10, label='预测CATE')
# 添加平滑曲线
income_sorted = np.sort(df_401k['income'].values)
from scipy.ndimage import uniform_filter1d
tau_smoothed = uniform_filter1d(df_401k.sort_values('income')['tau_pred'].values, size=200)
axes[0, 1].plot(income_sorted, tau_smoothed, 'r-', linewidth=2.5, label='平滑趋势')
axes[0, 1].set_xlabel('收入 ($)', fontsize=11)
axes[0, 1].set_ylabel('处理效应 ($)', fontsize=11)
axes[0, 1].set_title('收入与401(k)效应', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# (c) 按年龄的CATE
axes[1, 0].scatter(df_401k['age'], df_401k['tau_pred'], alpha=0.2, s=10, label='预测CATE')
age_sorted = np.sort(df_401k['age'].values)
tau_age_smoothed = uniform_filter1d(df_401k.sort_values('age')['tau_pred'].values, size=200)
axes[1, 0].plot(age_sorted, tau_age_smoothed, 'r-', linewidth=2.5, label='平滑趋势')
axes[1, 0].set_xlabel('年龄', fontsize=11)
axes[1, 0].set_ylabel('处理效应 ($)', fontsize=11)
axes[1, 0].set_title('年龄与401(k)效应', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# (d) CATE分布
axes[1, 1].hist(df_401k['tau_pred'], bins=50, alpha=0.7, edgecolor='black', color='steelblue')
axes[1, 1].axvline(x=df_401k['tau_pred'].mean(), color='red', linestyle='--',
                   linewidth=2.5, label=f"ATE = ${df_401k['tau_pred'].mean():,.0f}")
axes[1, 1].set_xlabel('处理效应 ($)', fontsize=11)
axes[1, 1].set_ylabel('频数', fontsize=11)
axes[1, 1].set_title('401(k)处理效应分布', fontsize=12, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

政策含义

python
# 政策分析:谁应该被鼓励参与401(k)?

# 识别高受益者和低受益者
threshold_pct = 50  # 中位数
threshold = df_401k['tau_pred'].median()

df_401k['high_benefit'] = (df_401k['tau_pred'] > threshold).astype(int)

print("=" * 70)
print("政策分析:高受益 vs 低受益群体")
print("=" * 70)

high_benefit_profile = df_401k[df_401k['high_benefit']==1][['age', 'income', 'education']].mean()
low_benefit_profile = df_401k[df_401k['high_benefit']==0][['age', 'income', 'education']].mean()

comparison = pd.DataFrame({
    '高受益群体': high_benefit_profile,
    '低受益群体': low_benefit_profile,
    '差异': high_benefit_profile - low_benefit_profile
})

print(comparison.round(1))

print(f"\n高受益群体特征:")
print(f"  平均年龄: {high_benefit_profile['age']:.1f} 岁")
print(f"  平均收入: ${high_benefit_profile['income']:,.0f}")
print(f"  平均教育: {high_benefit_profile['education']:.1f} 年")

print(f"\n政策建议:")
print(f"  - 重点针对低收入人群推广401(k)计划")
print(f"  - 年轻人和已婚人士应优先考虑")
print(f"  - 提供配套的财务教育")

案例2:LaLonde就业培训项目

背景

研究问题:就业培训对不同人群的收入提升是否有差异?

数据来源:LaLonde (1986), Dehejia & Wahba (1999)

经典案例:比较实验数据与观察性数据的因果推断

加载数据

python
# LaLonde数据集(这里模拟,实际应从statsmodels或网络加载)
np.random.seed(123)
n_nsw = 722  # NSW实验数据

# 协变量
age_lalonde = np.random.uniform(17, 55, n_nsw)
education_lalonde = np.random.choice(range(3, 17), n_nsw)
black = np.random.binomial(1, 0.84, n_nsw)  # NSW样本主要是黑人
hispanic = np.random.binomial(1, 0.06, n_nsw)
married = np.random.binomial(1, 0.19, n_nsw)
nodegree = np.random.binomial(1, 0.71, n_nsw)  # 无高中文凭
re74 = np.maximum(0, np.random.normal(2095, 4887, n_nsw))  # 1974年收入
re75 = np.maximum(0, np.random.normal(1532, 3219, n_nsw))  # 1975年收入

# 处理分配(RCT)
treat = np.random.binomial(1, 0.5, n_nsw)

# 真实的异质性效应
# 年轻人、有工作经验者受益更多
true_tau_lalonde = (
    1500 +  # 基础效应
    50 * (35 - age_lalonde) +  # 年轻人受益多
    100 * education_lalonde +  # 教育正效应
    0.3 * re74 +  # 之前有收入者受益多
    -500 * nodegree  # 无文凭者受益少
)

# 1978年收入(结果)
re78_0 = 3000 + 200 * education_lalonde + 0.5 * re74 + 0.3 * re75 + \
         1000 * (1 - nodegree) + np.random.normal(0, 2000, n_nsw)
re78 = re78_0 + treat * true_tau_lalonde

# 确保收入非负
re78 = np.maximum(0, re78)

df_lalonde = pd.DataFrame({
    'age': age_lalonde,
    'education': education_lalonde,
    'black': black,
    'hispanic': hispanic,
    'married': married,
    'nodegree': nodegree,
    're74': re74,
    're75': re75,
    'treat': treat,
    're78': re78,
    'true_tau': true_tau_lalonde
})

print("=" * 70)
print("LaLonde NSW数据集")
print("=" * 70)
print(f"样本量: {len(df_lalonde)}")
print(f"处理组: {df_lalonde['treat'].sum()}")
print(f"对照组: {(1-df_lalonde['treat']).sum()}")
print(f"\n人群特征:")
print(df_lalonde[['age', 'education', 'black', 'married', 're74', 're75']].describe())

Causal Forest分析

python
# 准备数据
X_lalonde = df_lalonde[['age', 'education', 'black', 'hispanic',
                        'married', 'nodegree', 're74', 're75']].values
T_lalonde = df_lalonde['treat'].values.reshape(-1, 1)
Y_lalonde = df_lalonde['re78'].values

# 训练Causal Forest
cf_lalonde = EconMLCausalForest(
    n_estimators=2000,
    min_samples_leaf=10,
    random_state=123
)

cf_lalonde.fit(X_lalonde, T_lalonde, Y_lalonde)

# 预测CATE
df_lalonde['tau_pred'] = cf_lalonde.predict(X_lalonde)
tau_interval_lalonde = cf_lalonde.predict_interval(X_lalonde, alpha=0.05)
df_lalonde['tau_lower'] = tau_interval_lalonde[0]
df_lalonde['tau_upper'] = tau_interval_lalonde[1]

print("=" * 70)
print("LaLonde数据:Causal Forest结果")
print("=" * 70)
print(f"估计的ATE: ${df_lalonde['tau_pred'].mean():,.0f}")
print(f"真实的ATE: ${df_lalonde['true_tau'].mean():,.0f}")
print(f"\nCATE范围: [${df_lalonde['tau_pred'].min():,.0f}, ${df_lalonde['tau_pred'].max():,.0f}]")

# 评估
mse_lalonde = mean_squared_error(df_lalonde['true_tau'], df_lalonde['tau_pred'])
r2_lalonde = r2_score(df_lalonde['true_tau'], df_lalonde['tau_pred'])
print(f"\n预测性能:")
print(f"  MSE: {mse_lalonde:,.0f}")
print(f"  R²: {r2_lalonde:.3f}")

异质性解释

python
# 使用CATE Interpreter
from econml.cate_interpreter import SingleTreeCateInterpreter

# 构建解释树
interpreter = SingleTreeCateInterpreter(
    include_model_uncertainty=False,
    max_depth=3,
    min_samples_leaf=10
)

interpreter.interpret(cf_lalonde, X_lalonde)
interpreter.plot(
    feature_names=['age', 'education', 'black', 'hispanic',
                   'married', 'nodegree', 're74', 're75'],
    fontsize=10
)
plt.title('LaLonde数据:CATE决策树', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# 按关键特征分组分析
print("\n" + "=" * 70)
print("异质性分析")
print("=" * 70)

# 1. 按年龄
df_lalonde['age_group_l'] = pd.cut(df_lalonde['age'],
                                   bins=[17, 25, 35, 55],
                                   labels=['17-25', '26-35', '36-55'])
age_effects_l = df_lalonde.groupby('age_group_l')['tau_pred'].agg(['mean', 'std', 'count'])
print("\n按年龄组:")
print(age_effects_l.round(0))

# 2. 按教育
df_lalonde['edu_level'] = pd.cut(df_lalonde['education'],
                                 bins=[0, 12, 16, 20],
                                 labels=['<高中', '高中-大学', '>大学'])
edu_effects_l = df_lalonde.groupby('edu_level')['tau_pred'].agg(['mean', 'std', 'count'])
print("\n按教育水平:")
print(edu_effects_l.round(0))

# 3. 按1974年收入
df_lalonde['re74_group'] = pd.cut(df_lalonde['re74'],
                                  bins=[-1, 0, 2000, 1000000],
                                  labels=['无收入', '低收入', '中高收入'])
re74_effects = df_lalonde.groupby('re74_group')['tau_pred'].agg(['mean', 'std', 'count'])
print("\n按1974年收入:")
print(re74_effects.round(0))

可视化

python
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (a) CATE vs 年龄
axes[0, 0].scatter(df_lalonde['age'], df_lalonde['tau_pred'], alpha=0.4, s=20)
age_bins = np.linspace(17, 55, 20)
age_means = [df_lalonde[(df_lalonde['age'] >= age_bins[i]) &
                        (df_lalonde['age'] < age_bins[i+1])]['tau_pred'].mean()
             for i in range(len(age_bins)-1)]
axes[0, 0].plot(age_bins[:-1], age_means, 'r-', linewidth=2.5, label='平均趋势')
axes[0, 0].set_xlabel('年龄', fontsize=11)
axes[0, 0].set_ylabel('处理效应 ($)', fontsize=11)
axes[0, 0].set_title('年龄与培训效应', fontsize=12, fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# (b) CATE vs 教育
axes[0, 1].scatter(df_lalonde['education'], df_lalonde['tau_pred'], alpha=0.4, s=20)
edu_means = df_lalonde.groupby('education')['tau_pred'].mean()
axes[0, 1].plot(edu_means.index, edu_means.values, 'ro-', linewidth=2, markersize=8)
axes[0, 1].set_xlabel('教育年限', fontsize=11)
axes[0, 1].set_ylabel('处理效应 ($)', fontsize=11)
axes[0, 1].set_title('教育与培训效应', fontsize=12, fontweight='bold')
axes[0, 1].grid(True, alpha=0.3)

# (c) CATE vs 1974年收入
axes[1, 0].scatter(df_lalonde['re74'], df_lalonde['tau_pred'], alpha=0.4, s=20)
axes[1, 0].set_xlabel('1974年收入 ($)', fontsize=11)
axes[1, 0].set_ylabel('处理效应 ($)', fontsize=11)
axes[1, 0].set_title('之前收入与培训效应', fontsize=12, fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)

# (d) 受益者分布
top_20_pct = df_lalonde['tau_pred'].quantile(0.8)
bottom_20_pct = df_lalonde['tau_pred'].quantile(0.2)

df_lalonde['benefit_group'] = pd.cut(df_lalonde['tau_pred'],
                                     bins=[-np.inf, bottom_20_pct, top_20_pct, np.inf],
                                     labels=['低受益', '中受益', '高受益'])

benefit_counts = df_lalonde['benefit_group'].value_counts()
colors_benefit = ['red', 'yellow', 'green']
axes[1, 1].bar(benefit_counts.index, benefit_counts.values, color=colors_benefit, alpha=0.7, edgecolor='black')
axes[1, 1].set_xlabel('受益组', fontsize=11)
axes[1, 1].set_ylabel('人数', fontsize=11)
axes[1, 1].set_title('培训受益者分布', fontsize=12, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# 高受益者特征
print("\n" + "=" * 70)
print("高受益者(Top 20%)特征")
print("=" * 70)
top_beneficiaries = df_lalonde[df_lalonde['benefit_group'] == '高受益']
print(top_beneficiaries[['age', 'education', 'married', 're74', 'tau_pred']].describe())

案例3:个性化医疗决策

背景

研究问题:某种治疗对不同患者的效果是否有差异?

场景:糖尿病药物试验

生成模拟医疗数据

python
np.random.seed(456)
n_patients = 2000

# 患者特征
age_patients = np.random.uniform(30, 80, n_patients)
bmi = np.random.normal(28, 5, n_patients)  # BMI
baseline_glucose = np.random.normal(140, 20, n_patients)  # 基线血糖
hypertension = np.random.binomial(1, 0.4, n_patients)  # 高血压
smoker = np.random.binomial(1, 0.25, n_patients)  # 吸烟
family_history = np.random.binomial(1, 0.6, n_patients)  # 家族史

# 处理分配(RCT)
treatment = np.random.binomial(1, 0.5, n_patients)

# 真实的异质性效应(血糖降低幅度)
# 年轻人、高BMI、高基线血糖者效果更好
true_tau_medical = (
    -15 +  # 基础效应(负值=降低血糖)
    -0.3 * (age_patients - 55) +  # 年轻人效果好
    -0.8 * (bmi - 28) +  # 高BMI效果好
    -0.2 * (baseline_glucose - 140) +  # 高血糖效果好
    5 * smoker +  # 吸烟者效果差
    -3 * family_history  # 有家族史效果好
)

# 结果:3个月后血糖水平
glucose_3months_0 = (
    baseline_glucose -
    5 -  # 自然下降
    0.1 * (age_patients - 55) +
    0.3 * bmi +
    np.random.normal(0, 8, n_patients)
)

glucose_3months = glucose_3months_0 + treatment * true_tau_medical

df_medical = pd.DataFrame({
    'age': age_patients,
    'bmi': bmi,
    'baseline_glucose': baseline_glucose,
    'hypertension': hypertension,
    'smoker': smoker,
    'family_history': family_history,
    'treatment': treatment,
    'glucose_3months': glucose_3months,
    'true_tau': true_tau_medical
})

print("=" * 70)
print("医疗数据集:糖尿病药物试验")
print("=" * 70)
print(f"样本量: {len(df_medical)}")
print(f"处理组: {df_medical['treatment'].sum()}")
print(f"对照组: {(1-df_medical['treatment']).sum()}")
print(f"\n患者特征:")
print(df_medical[['age', 'bmi', 'baseline_glucose']].describe())

Causal Forest分析

python
# 准备数据
X_medical = df_medical[['age', 'bmi', 'baseline_glucose',
                        'hypertension', 'smoker', 'family_history']].values
T_medical = df_medical['treatment'].values.reshape(-1, 1)
Y_medical = df_medical['glucose_3months'].values

# 训练Causal Forest
cf_medical = EconMLCausalForest(
    n_estimators=2500,
    min_samples_leaf=15,
    random_state=456
)

cf_medical.fit(X_medical, T_medical, Y_medical)

# 预测CATE(血糖降低幅度,负值表示降低)
df_medical['tau_pred'] = cf_medical.predict(X_medical)
tau_interval_medical = cf_medical.predict_interval(X_medical, alpha=0.05)

print("=" * 70)
print("医疗数据:Causal Forest结果")
print("=" * 70)
print(f"估计的ATE(血糖降低): {df_medical['tau_pred'].mean():.2f} mg/dL")
print(f"真实的ATE: {df_medical['true_tau'].mean():.2f} mg/dL")

# 评估
mse_medical = mean_squared_error(df_medical['true_tau'], df_medical['tau_pred'])
r2_medical = r2_score(df_medical['true_tau'], df_medical['tau_pred'])
print(f"\n预测性能:")
print(f"  MSE: {mse_medical:.2f}")
print(f"  R²: {r2_medical:.3f}")

个性化治疗决策

python
# 决策规则:只给效果显著的患者用药

# 考虑药物副作用成本(等价于需要血糖降低至少5mg/dL才值得)
threshold_benefit = -5  # 负值表示降低

# 最优决策
df_medical['should_treat'] = (df_medical['tau_pred'] < threshold_benefit).astype(int)

print("\n" + "=" * 70)
print("个性化治疗决策")
print("=" * 70)
print(f"决策阈值: 血糖降低至少 {-threshold_benefit} mg/dL")
print(f"推荐治疗比例: {df_medical['should_treat'].mean():.1%}")

# 应该治疗 vs 不应该治疗的患者特征
should_treat = df_medical[df_medical['should_treat'] == 1]
should_not_treat = df_medical[df_medical['should_treat'] == 0]

comparison_medical = pd.DataFrame({
    '应该治疗': should_treat[['age', 'bmi', 'baseline_glucose', 'smoker']].mean(),
    '不应该治疗': should_not_treat[['age', 'bmi', 'baseline_glucose', 'smoker']].mean()
})

print("\n患者特征对比:")
print(comparison_medical.round(1))

# 可视化
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (a) 真实 vs 预测
axes[0, 0].scatter(df_medical['true_tau'], df_medical['tau_pred'], alpha=0.3, s=15)
axes[0, 0].plot([df_medical['true_tau'].min(), df_medical['true_tau'].max()],
                [df_medical['true_tau'].min(), df_medical['true_tau'].max()],
                'r--', linewidth=2)
axes[0, 0].set_xlabel('真实治疗效应 (mg/dL)', fontsize=11)
axes[0, 0].set_ylabel('预测治疗效应 (mg/dL)', fontsize=11)
axes[0, 0].set_title(f'治疗效应预测 (R² = {r2_medical:.3f})', fontsize=12, fontweight='bold')
axes[0, 0].grid(True, alpha=0.3)

# (b) 年龄与治疗效应
axes[0, 1].scatter(df_medical['age'], df_medical['tau_pred'], alpha=0.3, s=15,
                   c=df_medical['should_treat'], cmap='RdYlGn_r')
axes[0, 1].axhline(y=threshold_benefit, color='red', linestyle='--', linewidth=2,
                   label=f'决策阈值 ({threshold_benefit} mg/dL)')
axes[0, 1].set_xlabel('年龄', fontsize=11)
axes[0, 1].set_ylabel('治疗效应 (mg/dL)', fontsize=11)
axes[0, 1].set_title('年龄与治疗效应', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# (c) BMI与治疗效应
axes[1, 0].scatter(df_medical['bmi'], df_medical['tau_pred'], alpha=0.3, s=15,
                   c=df_medical['should_treat'], cmap='RdYlGn_r')
axes[1, 0].axhline(y=threshold_benefit, color='red', linestyle='--', linewidth=2,
                   label=f'决策阈值')
axes[1, 0].set_xlabel('BMI', fontsize=11)
axes[1, 0].set_ylabel('治疗效应 (mg/dL)', fontsize=11)
axes[1, 0].set_title('BMI与治疗效应', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# (d) 决策分布
decision_counts = df_medical['should_treat'].value_counts()
labels = ['不推荐', '推荐']
colors = ['red', 'green']
axes[1, 1].bar(labels, [decision_counts[0], decision_counts[1]],
               color=colors, alpha=0.7, edgecolor='black')
axes[1, 1].set_ylabel('患者数量', fontsize=11)
axes[1, 1].set_title('个性化治疗决策分布', fontsize=12, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3, axis='y')

for i, v in enumerate([decision_counts[0], decision_counts[1]]):
    axes[1, 1].text(i, v + 20, str(v), ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

方法对比:econml vs causalml

使用causalml库

python
from causalml.inference.tree import UpliftRandomForestClassifier
from causalml.metrics import plot_gain

# 注意:causalml的UpliftTree要求二分类结果
# 我们用401(k)数据,定义结果为"净资产>中位数"

df_401k['high_wealth'] = (df_401k['net_tfa'] > df_401k['net_tfa'].median()).astype(int)

# 准备数据
X_401k_causalml = df_401k[['age', 'income', 'education', 'marital_status', 'family_size']].values
treatment_401k = df_401k['e401'].values
y_401k_binary = df_401k['high_wealth'].values

# 训练Uplift Random Forest
uplift_model = UpliftRandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=20,
    random_state=42
)

uplift_model.fit(X_401k_causalml, treatment_401k, y_401k_binary)

# 预测uplift(处理效应)
uplift_pred = uplift_model.predict(X_401k_causalml)

print("=" * 70)
print("causalml: Uplift Random Forest")
print("=" * 70)
print(f"平均uplift: {uplift_pred.mean():.4f}")
print(f"Uplift范围: [{uplift_pred.min():.4f}, {uplift_pred.max():.4f}]")

# Gain Chart
from causalml.metrics import plot_gain

fig, ax = plt.subplots(figsize=(10, 6))
plot_gain(df_401k['high_wealth'], uplift_pred, df_401k['e401'], ax=ax)
ax.set_title('Cumulative Gain Chart (causalml)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

方法对比总结

维度econmlcausalml
易用性⭐⭐⭐⭐⭐⭐⭐⭐
文档优秀良好
连续结果完全支持需要转换
置信区间内置不支持
算法丰富度高(DML, GRF等)中(Meta-learners)
可视化内置解释工具Uplift曲线
推荐场景严谨研究、政策评估A/B测试、营销

本节小结

完整工作流

python
# 标准Causal Forest工作流

# 1. 数据准备
X = df[feature_columns].values
T = df['treatment'].values.reshape(-1, 1)
Y = df['outcome'].values

# 2. 训练模型
from econml.grf import CausalForest
cf = CausalForest(n_estimators=3000, min_samples_leaf=10)
cf.fit(X, T, Y)

# 3. 预测CATE
tau = cf.predict(X)
tau_interval = cf.predict_interval(X, alpha=0.05)

# 4. 解释异质性
from econml.cate_interpreter import SingleTreeCateInterpreter
interp = SingleTreeCateInterpreter()
interp.interpret(cf, X)
interp.plot(feature_names=feature_columns)

# 5. 政策学习
optimal_policy = (tau > cost_threshold).astype(int)
policy_value = (tau * optimal_policy).mean() - cost * optimal_policy.mean()

# 6. 可视化
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], tau, alpha=0.3)
plt.xlabel('Feature')
plt.ylabel('CATE')
plt.show()

关键要点

  1. 数据准备至关重要

    • 检查缺失值、异常值
    • 理解处理分配机制
    • 考虑选择性偏差
  2. 模型调参

    • n_estimators: 2000-5000
    • min_samples_leaf: 10-20
    • max_depth: None(让树自由生长)
  3. 结果解释

    • 不仅看ATE,更要看异质性
    • 识别高/低受益群体
    • 理解驱动因素(BLP, CATE Interpreter)
  4. 政策含义

    • 量化个性化决策的价值
    • 考虑成本和约束
    • 提供可操作的建议

下一节预告

第5节:高级主题与最新进展中,我们将探讨:

  • Double/Debiased Machine Learning (DML)
  • Generalized Random Forests (GRF)
  • Causal Forest与深度学习的结合
  • 前沿研究和未来方向

从案例到洞察:让因果推断指导实际决策!

基于 MIT 许可证发布。内容版权归作者所有。