6.3 双变量可视化(Bivariate Visualization)
"A picture shows me at a glance what it takes dozens of pages of a book to expound." "一张图片让我一眼就看到了一本书需要几十页才能阐述的内容。" — Ivan Turgenev, Russian Novelist (俄国小说家)
探索变量之间的关系:从相关到因果的第一步
本节目标
完成本节后,你将能够:
- 使用散点图展示两个连续变量的关系
- 识别线性、非线性和条件关系
- 使用分组图表比较不同类别的分布
- 创建和解释相关矩阵热力图
- 使用散点图矩阵进行多变量探索
- 可视化Simpson's Paradox(辛普森悖论)
- 处理大数据可视化问题
两个连续变量的关系
1. 散点图(Scatter Plot)
用途:双变量关系的第一步探索
基础散点图 + 回归线
python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress, pearsonr, spearmanr
# Configure fonts so the Chinese axis labels and titles render correctly.
# 'Arial Unicode MS' ships only with macOS, so common Windows and Linux CJK
# fonts are listed as fallbacks; the original first choice is kept first.
plt.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',     # macOS
    'PingFang SC',          # macOS
    'Microsoft YaHei',      # Windows
    'SimHei',               # Windows
    'Noto Sans CJK SC',     # Linux
    'WenQuanYi Micro Hei',  # Linux
    'DejaVu Sans',          # matplotlib default, last resort (no CJK glyphs)
]
# Draw the minus sign as ASCII '-' because many CJK fonts lack U+2212.
plt.rcParams['axes.unicode_minus'] = False
# Simulate a small labour-market sample with a fixed seed for reproducibility.
np.random.seed(42)
n = 500

# Data generating process. The order of the random draws below is part of
# the reproducible result: education, experience, ability, then wage noise.
education = np.clip(np.random.normal(13, 3, n), 6, 20)  # years, bounded to [6, 20]
experience = np.random.uniform(0, 30, n)
ability = np.random.normal(0, 1, n)  # unobserved ability
wage_noise = np.random.normal(0, 0.3, n)

# Mincer wage equation plus an ability term (source of ability bias).
log_wage = (
    1.5
    + 0.08 * education
    + 0.03 * experience
    - 0.0005 * experience**2
    + 0.15 * ability
    + wage_noise
)
wage = np.exp(log_wage)

# Collect everything in one frame; the column order is wage, log_wage,
# education, experience, ability.
df = pd.DataFrame(
    dict(
        wage=wage,
        log_wage=log_wage,
        education=education,
        experience=experience,
        ability=ability,
    )
)
# Side-by-side comparison: wage in levels vs. log(wage), both against education.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_level, ax_log = axes
edu = df['education']

# Both fitted lines are drawn between the smallest and largest education value.
line_x = np.array([edu.min(), edu.max()])

# Left panel: raw wage (level-level specification).
ax_level.scatter(edu, df['wage'], alpha=0.5, s=50, color='steelblue')
slope, intercept, r_value, p_value, std_err = linregress(edu, df['wage'])
line_y = intercept + slope * line_x
ax_level.plot(line_x, line_y, 'r-', linewidth=2, label=f'R² = {r_value**2:.3f}')
ax_level.set_ylabel('工资(千元/月)', fontsize=12)
ax_level.set_title('教育 vs 工资(Level-Level)', fontsize=14, fontweight='bold')

# Right panel: log wage (log-level specification).
ax_log.scatter(edu, df['log_wage'], alpha=0.5, s=50, color='coral')
slope2, intercept2, r_value2, p_value2, std_err2 = linregress(edu, df['log_wage'])
line_y2 = intercept2 + slope2 * line_x
ax_log.plot(line_x, line_y2, 'r-', linewidth=2, label=f'R² = {r_value2**2:.3f}')
ax_log.set_ylabel('log(工资)', fontsize=12)
ax_log.set_title('教育 vs log(工资)(Log-Level)', fontsize=14, fontweight='bold')

# Decoration shared by both panels.
for panel in axes:
    panel.set_xlabel('教育年限(年)', fontsize=12)
    panel.legend(fontsize=11)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary of the two fits.
r2_level = r_value**2
r2_log = r_value2**2
print("模型对比:")
print(f"Level-Level: R² = {r2_level:.4f}, 斜率 = {slope:.3f}")
print(f"Log-Level: R² = {r2_log:.4f}, 斜率 = {slope2:.3f}")
print(f"\n解释: 对数转换后,R² 从 {r2_level:.3f} 提升到 {r2_log:.3f}")
print(f" 教育年限每增加 1 年,工资增长约 {slope2*100:.1f}%")
相关系数的计算和显著性检验
python
# Pearson and Spearman correlation between education and wage, each with
# its p-value.
edu_col, wage_col = df['education'], df['wage']
pearson_corr, pearson_p = pearsonr(edu_col, wage_col)
spearman_corr, spearman_p = spearmanr(edu_col, wage_col)

print("\n相关性分析:")
print("="*60)
print(f"Pearson 相关系数: r = {pearson_corr:.4f}, p = {pearson_p:.4e}")
print(f"Spearman 相关系数: ρ = {spearman_corr:.4f}, p = {spearman_p:.4e}")

# The conclusion is chosen from the Pearson p-value alone at the 0.001 level.
conclusion = (
    "\n结论: 教育与工资在 0.001 水平上显著正相关"
    if pearson_p < 0.001
    else "\n结论: 未发现显著相关"
)
print(conclusion)
# Three panels illustrating weak, moderate and strong linear correlation.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
correlations = [0.3, 0.6, 0.9]

for ax, target_corr in zip(axes, correlations):
    # Build y from x plus independent noise, weighted by the target
    # correlation. x is drawn first, then the noise.
    x = np.random.randn(200)
    noise = np.random.randn(200)
    y = target_corr * x + np.sqrt(1 - target_corr**2) * noise

    ax.scatter(x, y, alpha=0.5, s=40)

    # Overlay the least-squares line across the observed x-range.
    fit = linregress(x, y)
    x_line = np.array([x.min(), x.max()])
    y_line = fit.intercept + fit.slope * x_line
    ax.plot(x_line, y_line, 'r-', linewidth=2)

    # The title reports the target values, not the sample estimates.
    ax.set_title(f'r = {target_corr:.1f} (R² = {target_corr**2:.2f})',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.grid(True, alpha=0.3)

plt.suptitle('不同强度的线性相关', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
2. 非线性关系的可视化
案例:经验-工资的倒U型关系(注:按本节的数据生成过程,拐点在 0.03 / (2 × 0.0005) = 30 年,恰为样本经验范围的上限,因此图中主要表现为边际回报递减的凹形曲线)
python
# Experience vs. wage: raw scatter, a linear fit, and a quadratic fit.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
exp_years = df['experience']
wage_level = df['wage']

# Every panel shares the same scatter layer, axis labels and grid.
for ax in axes:
    ax.scatter(exp_years, wage_level, alpha=0.5, s=50, color='steelblue')
    ax.set_xlabel('工作经验(年)', fontsize=12)
    ax.set_ylabel('工资(千元/月)', fontsize=12)
    ax.grid(True, alpha=0.3)

# Panel 1: raw data only.
axes[0].set_title('经验 vs 工资(原始数据)', fontsize=14, fontweight='bold')

# Panel 2: straight-line fit, shown as the poorly suited model.
slope, intercept, r_value, _, _ = linregress(exp_years, wage_level)
x_line = np.linspace(exp_years.min(), exp_years.max(), 100)
y_line = intercept + slope * x_line
axes[1].plot(x_line, y_line, 'r-', linewidth=2,
             label=f'线性拟合 (R² = {r_value**2:.3f})')
axes[1].set_title('线性拟合(不合适)', fontsize=14, fontweight='bold')
axes[1].legend()

# Panel 3: second-degree polynomial fit.
from numpy.polynomial import polynomial as P
coeffs = np.polyfit(exp_years, wage_level, 2)
poly = np.poly1d(coeffs)
y_poly = poly(x_line)
axes[2].plot(x_line, y_poly, 'r-', linewidth=2, label='二次拟合')

# R² of the quadratic model: 1 - SS_res / SS_tot.
residuals = wage_level - poly(exp_years)
ss_res = np.sum(residuals**2)
ss_tot = np.sum((wage_level - wage_level.mean())**2)
r2_poly = 1 - (ss_res / ss_tot)
axes[2].set_title(f'二次拟合(R² = {r2_poly:.3f})', fontsize=14, fontweight='bold')
axes[2].legend()

plt.tight_layout()
plt.show()

r2_linear = r_value**2
print(f"\n模型对比:")
print(f"线性模型 R²: {r2_linear:.4f}")
print(f"二次模型 R²: {r2_poly:.4f}")
print(f"R² 提升: {(r2_poly - r_value**2)*100:.2f} 个百分点")
LOWESS 平滑曲线(局部加权回归)
python
from statsmodels.nonparametric.smoothers_lowess import lowess

# LOWESS (locally weighted regression) of log wage on experience.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_single, ax_compare = axes
exp_years = df['experience']
log_wage_obs = df['log_wage']

# Panel 1: scatter plus one LOWESS curve. Column 0 of the result is plotted
# on the x-axis and column 1 on the y-axis.
ax_single.scatter(exp_years, log_wage_obs, alpha=0.4, s=40, color='lightgray')
smoothed = lowess(log_wage_obs, exp_years, frac=0.3)
ax_single.plot(smoothed[:, 0], smoothed[:, 1], 'r-', linewidth=3, label='LOWESS 曲线')
ax_single.set_title('LOWESS 平滑:识别非线性关系', fontsize=14, fontweight='bold')
ax_single.legend(fontsize=11)

# Panel 2: the same data smoothed with three different `frac` values.
ax_compare.scatter(exp_years, log_wage_obs, alpha=0.3, s=30, color='lightgray')
frac_settings = [
    (0.1, 'red', '过拟合 (frac=0.1)'),
    (0.3, 'blue', '适中 (frac=0.3)'),
    (0.6, 'green', '过平滑 (frac=0.6)'),
]
for frac, color, label in frac_settings:
    smoothed = lowess(log_wage_obs, exp_years, frac=frac)
    ax_compare.plot(smoothed[:, 0], smoothed[:, 1], color=color, linewidth=2, label=label)
ax_compare.set_title('不同 LOWESS 参数的影响', fontsize=14, fontweight='bold')
ax_compare.legend(fontsize=10)

# Decoration shared by both panels.
for panel in axes:
    panel.set_xlabel('工作经验(年)', fontsize=12)
    panel.set_ylabel('log(工资)', fontsize=12)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
3. 大数据可视化:六边形图和等高线图
python
# Simulate a large sample (10,000 rows) to demonstrate overplotting.
np.random.seed(123)
n_large = 10000

# Draw order is part of the reproducible result: education, experience, noise.
edu_large = np.random.normal(13, 3, n_large)
exp_large = np.random.uniform(0, 30, n_large)
noise_large = np.random.normal(0, 0.3, n_large)

# Mincer-style equation; this sample has no ability term.
log_wage_large = (
    1.5
    + 0.08 * edu_large
    + 0.03 * exp_large
    - 0.0005 * exp_large**2
    + noise_large
)
wage_large = np.exp(log_wage_large)
# Four ways to draw the same large sample: scatter, hexbin, 2D histogram,
# and KDE contours.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
(ax_scatter, ax_hex), (ax_hist, ax_kde) = axes

# Panel 1: plain scatter; even at alpha=0.1 the points overlap heavily.
ax_scatter.scatter(edu_large, wage_large, alpha=0.1, s=10, color='steelblue')
ax_scatter.set_title(f'散点图(N={n_large:,},重叠严重)', fontsize=14, fontweight='bold')
ax_scatter.grid(True, alpha=0.3)

# Panel 2: hexagonal binning; mincnt=1 leaves empty cells blank.
hb = ax_hex.hexbin(edu_large, wage_large, gridsize=40, cmap='YlOrRd', mincnt=1)
ax_hex.set_title('六边形图(密度清晰)', fontsize=14, fontweight='bold')
plt.colorbar(hb, ax=ax_hex, label='数据点数量')

# Panel 3: 2D histogram; element 3 of the return value is passed to the colorbar.
h = ax_hist.hist2d(edu_large, wage_large, bins=50, cmap='Blues')
ax_hist.set_title('2D 直方图', fontsize=14, fontweight='bold')
plt.colorbar(h[3], ax=ax_hist, label='频数')

# Panel 4: Gaussian KDE evaluated on a 100 x 100 grid over the data range.
from scipy.stats import gaussian_kde
kde = gaussian_kde([edu_large, wage_large])
xi, yi = np.mgrid[edu_large.min():edu_large.max():100j,
                  wage_large.min():wage_large.max():100j]
zi = kde(np.vstack([xi.flatten(), yi.flatten()]))
density = zi.reshape(xi.shape)
ax_kde.contourf(xi, yi, density, levels=15, cmap='viridis')
ax_kde.contour(xi, yi, density, levels=15, colors='white',
               linewidths=0.5, alpha=0.5)
ax_kde.set_title('等高线密度图', fontsize=14, fontweight='bold')

# Axis labels are identical on all four panels.
for panel in axes.flat:
    panel.set_xlabel('教育年限(年)')
    panel.set_ylabel('工资(千元/月)')

plt.tight_layout()
plt.show()
连续变量 vs 分类变量
1. 分组箱线图和小提琴图
python
# Add categorical variables (gender, region) and build a gender wage gap
# into the data. The guard makes this cell safe to re-run: without it, every
# re-run would subtract another 0.15 from women's log wage.
if 'female' not in df.columns:
    df['female'] = np.random.binomial(1, 0.5, len(df))
    df['gender'] = df['female'].map({0: '男性', 1: '女性'})
    df['region'] = np.random.choice(['东部', '中部', '西部'], len(df))
    # Lower women's log wage by 0.15 and rebuild the wage level from it.
    df.loc[df['female'] == 1, 'log_wage'] -= 0.15
    df['wage'] = np.exp(df['log_wage'])

# Fixed category orders keep the palette-to-group mapping and the x-axis
# layout independent of the row order in `df`.
gender_order = ['男性', '女性']
gender_palette = ['lightblue', 'lightcoral']  # male, female (matches gender_order)
region_order = ['东部', '中部', '西部']

fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`, and deprecate `errwidth`; both are left unchanged here so the code
# keeps working on older versions. Confirm against the installed version.

# ---- Row 1: comparison by gender ----
# Panel 1: box plot.
sns.boxplot(x='gender', y='wage', data=df, ax=axes[0, 0],
            order=gender_order, palette=gender_palette)
axes[0, 0].set_title('性别工资差距(箱线图)', fontsize=14, fontweight='bold')

# Panel 2: violin plot.
sns.violinplot(x='gender', y='wage', data=df, ax=axes[0, 1],
               order=gender_order, palette=gender_palette)
axes[0, 1].set_title('性别工资差距(小提琴图)', fontsize=14, fontweight='bold')

# Panel 3: swarm plot over a faint violin, on a 100-row subsample because
# swarm plots only suit small samples.
sample_df = df.sample(100, random_state=42)
sns.violinplot(x='gender', y='wage', data=sample_df, ax=axes[0, 2],
               order=gender_order, palette=gender_palette, inner=None, alpha=0.3)
sns.swarmplot(x='gender', y='wage', data=sample_df, ax=axes[0, 2],
              order=gender_order, color='black', alpha=0.5, size=3)
axes[0, 2].set_title('性别工资差距(带状图)', fontsize=14, fontweight='bold')

# ---- Row 2: comparison by region ----
# Panel 4: box plot.
sns.boxplot(x='region', y='wage', data=df, ax=axes[1, 0], order=region_order)
axes[1, 0].set_title('地区工资差距(箱线图)', fontsize=14, fontweight='bold')

# Panel 5: violin plot.
sns.violinplot(x='region', y='wage', data=df, ax=axes[1, 1], order=region_order)
axes[1, 1].set_title('地区工资差距(小提琴图)', fontsize=14, fontweight='bold')

# Panel 6: point plot. seaborn's default error bar here is a 95% confidence
# interval, not ±1 standard error, so the title says so.
sns.pointplot(x='region', y='wage', data=df, ax=axes[1, 2],
              order=region_order, capsize=0.2, errwidth=2)
axes[1, 2].set_title('地区工资差距(点图,均值±95% CI)', fontsize=14, fontweight='bold')

# Decoration shared by all six panels.
for panel in axes.flat:
    panel.set_ylabel('工资(千元/月)')
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()
# Significance tests for the group differences.
from scipy.stats import ttest_ind, f_oneway

# Two-sample t-test on wage by gender.
male_wage = df.loc[df['gender'] == '男性', 'wage']
female_wage = df.loc[df['gender'] == '女性', 'wage']
t_stat, t_p = ttest_ind(male_wage, female_wage)

male_mean = male_wage.mean()
female_mean = female_wage.mean()
print("\n性别工资差距检验:")
print("="*60)
print(f"男性平均工资: {male_mean:.2f} 千元")
print(f"女性平均工资: {female_mean:.2f} 千元")
print(f"差异: {male_mean - female_mean:.2f} 千元 ({(male_mean / female_mean - 1)*100:.1f}%)")
print(f"t 检验: t = {t_stat:.3f}, p = {t_p:.4e}")

# One-way ANOVA on wage across the three regions.
region_names = ['东部', '中部', '西部']
east, central, west = (df.loc[df['region'] == name, 'wage'] for name in region_names)
f_stat, f_p = f_oneway(east, central, west)

print("\n地区工资差距检验(ANOVA):")
print("="*60)
for name, wages in zip(region_names, (east, central, west)):
    print(f"{name}平均工资: {wages.mean():.2f} 千元")
print(f"F 检验: F = {f_stat:.3f}, p = {f_p:.4e}")
2. 条件关系:分组散点图
python
# Education-wage relationship split by gender, in levels and in logs.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One colour and marker per gender, used for points AND fitted lines in both
# panels. Without the explicit mapping, seaborn colours the points by order
# of appearance in the data, which can disagree with the hard-coded line colours.
gender_order = ['男性', '女性']
gender_colors = {'男性': 'blue', '女性': 'red'}
gender_markers = {'男性': 'o', '女性': 's'}

# Panel 1: wage in levels, drawn with matplotlib, one fit per gender.
for gender in gender_order:
    color = gender_colors[gender]
    mask = df['gender'] == gender
    x = df.loc[mask, 'education']
    y = df.loc[mask, 'wage']
    axes[0].scatter(x, y, alpha=0.4, s=50, color=color,
                    marker=gender_markers[gender], label=gender)
    # Separate least-squares line for this group.
    slope, intercept, r_value, _, _ = linregress(x, y)
    x_line = np.array([x.min(), x.max()])
    y_line = intercept + slope * x_line
    axes[0].plot(x_line, y_line, color=color, linewidth=2, linestyle='--')
axes[0].set_xlabel('教育年限(年)', fontsize=12)
axes[0].set_ylabel('工资(千元/月)', fontsize=12)
axes[0].set_title('教育-工资关系的性别差异(平行性检验)', fontsize=14, fontweight='bold')
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)

# Panel 2: log wage, drawn with seaborn in an lmplot-like style.
sns.scatterplot(x='education', y='log_wage', hue='gender', style='gender',
                hue_order=gender_order, style_order=gender_order,
                palette=gender_colors, markers=gender_markers,
                data=df, ax=axes[1], alpha=0.5, s=50)
for gender in gender_order:
    sns.regplot(x='education', y='log_wage', data=df[df['gender'] == gender],
                ax=axes[1], scatter=False, color=gender_colors[gender],
                line_kws={'linewidth': 2})
axes[1].set_xlabel('教育年限(年)', fontsize=12)
axes[1].set_ylabel('log(工资)', fontsize=12)
axes[1].set_title('教育-log(工资)关系的性别差异', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
Simpson's Paradox(辛普森悖论)
定义:整体趋势与分组趋势相反的现象
python
# Generate data that exhibits Simpson's paradox.
#
# The reversal needs the grouping variable (school) to be related to BOTH
# axes: better schools have higher base scores AND their students study
# longer. If study hours were drawn from the same range for every school,
# the pooled slope would equal the within-school slope and there would be
# no reversal.
np.random.seed(999)

# (school, base score, within-school slope, (min, max) study hours per day)
# The within-school slope is steep enough, relative to the noise (sd = 5)
# and the 4-hour range, that each school's fitted slope is reliably negative.
school_specs = [
    ('名校', 80, -2.0, (6.0, 10.0)),
    ('普校', 60, -2.0, (3.5, 7.5)),
    ('差校', 40, -2.0, (1.0, 5.0)),
]

schools = []
for school, base_score, slope, (min_hours, max_hours) in school_specs:
    n_school = 100
    study_hours = np.random.uniform(min_hours, max_hours, n_school)
    scores = base_score + slope * study_hours + np.random.normal(0, 5, n_school)
    schools.append(pd.DataFrame({
        'study_hours': study_hours,
        'score': scores,
        'school': school
    }))

# One long frame with columns study_hours, score, school (300 rows).
df_simpson = pd.concat(schools, ignore_index=True)
# Visualise the pooled trend next to the per-school trends.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax_pooled, ax_grouped = axes
hours_all = df_simpson['study_hours']
score_all = df_simpson['score']

# Panel 1: all schools pooled, with a single regression line.
ax_pooled.scatter(hours_all, score_all, alpha=0.5, s=50)
slope_all, intercept_all, r_all, _, _ = linregress(hours_all, score_all)
x_line = np.array([hours_all.min(), hours_all.max()])
y_line = intercept_all + slope_all * x_line
ax_pooled.plot(x_line, y_line, 'r-', linewidth=2,
               label=f'整体: 斜率={slope_all:.3f}')
ax_pooled.set_title('整体趋势:学习时间越长,成绩越好?', fontsize=14, fontweight='bold')
ax_pooled.legend()

# Panel 2: one colour and one regression line per school.
colors = {'名校': 'red', '普校': 'blue', '差校': 'green'}
for school, color in colors.items():
    mask = df_simpson['school'] == school
    x = df_simpson.loc[mask, 'study_hours']
    y = df_simpson.loc[mask, 'score']
    ax_grouped.scatter(x, y, alpha=0.5, s=50, color=color, label=school)

    # Within-school least-squares line.
    slope, intercept, _, _, _ = linregress(x, y)
    x_line = np.array([x.min(), x.max()])
    y_line = intercept + slope * x_line
    ax_grouped.plot(x_line, y_line, color=color, linewidth=2, linestyle='--',
                    label=f'{school}: 斜率={slope:.3f}')
ax_grouped.set_title('分组趋势:控制学校后,学习时间越长,成绩越差!', fontsize=14, fontweight='bold')
ax_grouped.legend(fontsize=10, loc='upper right')

# Decoration shared by both panels.
for panel in axes:
    panel.set_xlabel('学习时长(小时/天)', fontsize=12)
    panel.set_ylabel('考试成绩', fontsize=12)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric summary: pooled correlation, then the correlation within each school.
print("\nSimpson's Paradox 演示:")
print("="*60)
overall_corr = pearsonr(hours_all, score_all)[0]
print(f"整体相关系数: {overall_corr:.3f}")
for school in ['名校', '普校', '差校']:
    mask = df_simpson['school'] == school
    corr = pearsonr(df_simpson.loc[mask, 'study_hours'],
                    df_simpson.loc[mask, 'score'])[0]
    print(f"{school}内相关系数: {corr:.3f}")
print("\n结论: 忽略混淆变量(学校)会导致错误的因果推断!")
相关性分析与可视化
1. 相关矩阵热力图(进阶版)
python
# Variables included in the correlation analysis.
analysis_vars = ['wage', 'education', 'experience', 'log_wage']
df_corr = df[analysis_vars].copy()

from scipy.stats import pearsonr


def corrfunc(x, y):
    """Return the result of `pearsonr(x, y)` (correlation and p-value)."""
    return pearsonr(x, y)


# Pairwise Pearson correlations.
corr_matrix = df_corr.corr()

# Matching matrix of p-values; the diagonal is left at 0.
n_vars = len(analysis_vars)
p_matrix = np.zeros((n_vars, n_vars))
for i, j in np.ndindex(n_vars, n_vars):
    if i == j:
        continue
    _, p = pearsonr(df_corr[analysis_vars[i]], df_corr[analysis_vars[j]])
    p_matrix[i, j] = p

# Two heatmaps: all coefficients, and only the significant ones.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
heatmap_style = dict(annot=True, fmt='.3f', cmap='coolwarm', center=0,
                     square=True, linewidths=1, vmin=-1, vmax=1)

# Panel 1: every coefficient.
sns.heatmap(corr_matrix, cbar_kws={"shrink": 0.8}, ax=axes[0], **heatmap_style)
axes[0].set_title('相关系数矩阵', fontsize=14, fontweight='bold')

# Panel 2: cells with p > 0.05 are masked out.
mask = p_matrix > 0.05
sns.heatmap(corr_matrix, cbar_kws={"shrink": 0.8}, ax=axes[1], mask=mask,
            **heatmap_style)
axes[1].set_title('相关系数矩阵(只显示 p<0.05)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()
# 打印详细信息
print("\n相关系数矩阵(带显著性):")
print("="*60)
for i, var1 in enumerate(analysis_vars):
for j, var2 in enumerate(analysis_vars):
if i < j: # 只打印上三角
corr = corr_matrix.iloc[i, j]
p = p_matrix[i, j]
sig = '***' if p < 0.001 else ('**' if p < 0.01 else ('*' if p < 0.05 else ''))
print(f"{var1:12s} vs {var2:12s}: r = {corr:6.3f}{sig:3s} (p = {p:.4f})")2. 散点图矩阵(Pair Plot)
python
# Scatter-plot matrix with per-gender correlation annotations.
plot_vars = ['wage', 'education', 'experience', 'log_wage']
df_plot = df[plot_vars + ['gender']].copy()

# seaborn pair plot: KDE on the diagonal, points coloured by gender.
g = sns.pairplot(df_plot, hue='gender', diag_kind='kde',
                 plot_kws={'alpha': 0.5, 's': 30},
                 diag_kws={'linewidth': 2},
                 height=3, aspect=1)


def corrfunc_plot(x, y, **kwargs):
    """Annotate the current axes with the Pearson r of `x` and `y`.

    When the grid has a `hue`, seaborn calls this once per hue level on the
    same axes, passing that level's `label` and `color`. Each call therefore
    writes on its own line, in the group's colour, instead of overprinting
    the previous annotation at a fixed position.

    Args:
        x, y: Data for the subset being drawn.
        **kwargs: Extra keyword arguments from seaborn; `label` and `color`
            are used when present, everything else is ignored.
    """
    r, _ = pearsonr(x, y)
    ax = plt.gca()
    # Move each further annotation one line down from the top-left corner.
    line_index = len(ax.texts)
    label = kwargs.get('label')
    text = f'r = {r:.3f}' if label is None else f'{label}: r = {r:.3f}'
    ax.annotate(text, xy=(0.1, 0.9 - 0.1 * line_index), xycoords=ax.transAxes,
                fontsize=11, fontweight='bold',
                color=kwargs.get('color', 'black'))


g.map_lower(corrfunc_plot)
plt.suptitle('散点图矩阵:多变量关系探索', y=1.01, fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
实战案例:劳动力市场的完整双变量分析
python
# One-figure bivariate EDA dashboard on a 3 x 3 grid; the bottom row is a
# single wide panel.
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)
region_order = ['东部', '中部', '西部']

# 1. Education vs. log wage; points coloured by the `female` indicator.
ax1 = fig.add_subplot(gs[0, 0])
ax1.scatter(df['education'], df['log_wage'], alpha=0.4, s=30,
            c=df['female'], cmap='RdBu_r')
sns.regplot(x='education', y='log_wage', data=df, ax=ax1, scatter=False, color='black')
ax1.set(xlabel='教育年限(年)', ylabel='log(工资)', title='教育 vs 工资')
ax1.grid(True, alpha=0.3)

# 2. Experience vs. log wage with a LOWESS curve.
ax2 = fig.add_subplot(gs[0, 1])
ax2.scatter(df['experience'], df['log_wage'], alpha=0.4, s=30,
            c=df['female'], cmap='RdBu_r')
smoothed = lowess(df['log_wage'], df['experience'], frac=0.3)
ax2.plot(smoothed[:, 0], smoothed[:, 1], 'k-', linewidth=2)
ax2.set(xlabel='工作经验(年)', ylabel='log(工资)', title='经验 vs 工资(LOWESS)')
ax2.grid(True, alpha=0.3)

# 3. Correlation matrix of the three continuous variables.
ax3 = fig.add_subplot(gs[0, 2])
corr_mini = df[['log_wage', 'education', 'experience']].corr()
sns.heatmap(corr_mini, annot=True, fmt='.3f', cmap='coolwarm', center=0,
            square=True, cbar_kws={"shrink": 0.8}, ax=ax3, vmin=-1, vmax=1)
ax3.set_title('相关矩阵')

# 4. Log-wage distribution by gender.
ax4 = fig.add_subplot(gs[1, 0])
sns.violinplot(x='gender', y='log_wage', data=df, ax=ax4, inner='box')
ax4.set(ylabel='log(工资)', title='性别工资差距')
ax4.grid(True, alpha=0.3, axis='y')

# 5. Log-wage distribution by region.
ax5 = fig.add_subplot(gs[1, 1])
sns.boxplot(x='region', y='log_wage', data=df, ax=ax5, order=region_order)
ax5.set(ylabel='log(工资)', title='地区工资差距')
ax5.grid(True, alpha=0.3, axis='y')

# 6. Education histogram (with KDE) by gender.
ax6 = fig.add_subplot(gs[1, 2])
sns.histplot(data=df, x='education', hue='gender', kde=True, ax=ax6, bins=20)
ax6.set(xlabel='教育年限(年)', title='教育分布(按性别)')
ax6.grid(True, alpha=0.3, axis='y')

# 7. Per-gender regression of log wage on education, spanning the bottom row.
ax7 = fig.add_subplot(gs[2, :])
for gender, color in (('男性', 'blue'), ('女性', 'red')):
    mask = df['gender'] == gender
    x = df.loc[mask, 'education']
    y = df.loc[mask, 'log_wage']
    ax7.scatter(x, y, alpha=0.3, s=30, color=color, label=gender)
    # Fitted line for this group, drawn over its own education range.
    slope, intercept, r_value, _, _ = linregress(x, y)
    x_line = np.linspace(x.min(), x.max(), 100)
    y_line = intercept + slope * x_line
    ax7.plot(x_line, y_line, color=color, linewidth=2,
             label=f'{gender}: 斜率={slope:.4f}, R²={r_value**2:.3f}')
ax7.set(xlabel='教育年限(年)', ylabel='log(工资)', title='教育回报率的性别差异(交互效应)')
ax7.legend(fontsize=10)
ax7.grid(True, alpha=0.3)

plt.suptitle('劳动力市场的双变量探索性分析', fontsize=16, fontweight='bold', y=0.995)
plt.show()
# Text report: sample size, mean wages, and the return to education by gender.
male_mean_wage = df.loc[df['gender'] == '男性', 'wage'].mean()
female_mean_wage = df.loc[df['gender'] == '女性', 'wage'].mean()

print("\n劳动力市场双变量分析报告")
print("="*70)
print(f"样本量: {len(df):,}")
print(f"\n平均工资:")
print(f" 整体: {df['wage'].mean():.2f} 千元")
print(f" 男性: {male_mean_wage:.2f} 千元")
print(f" 女性: {female_mean_wage:.2f} 千元")
print(f" 性别差距: {(male_mean_wage / female_mean_wage - 1)*100:.1f}%")
print(f"\n教育回报率(log-level 模型):")
for gender in ['男性', '女性']:
    mask = df['gender'] == gender
    # linregress returns the correlation coefficient r as its third value;
    # it must be squared before being reported as R².
    slope, _, r_value, p, _ = linregress(df.loc[mask, 'education'], df.loc[mask, 'log_wage'])
    print(f" {gender}: {slope*100:.2f}% per year (R²={r_value**2:.3f}, p={p:.2e})")
本节小结
图表选择决策树
两个变量的可视化
├─ 都是连续变量?
│ ├─ 样本量 < 1000?
│ │ └─ 散点图 + 回归线 (sns.regplot)
│ └─ 样本量 ≥ 1000?
│ ├─ 六边形图 (hexbin)
│ └─ 2D 直方图 (hist2d)
│
├─ 一个连续 + 一个分类?
│ ├─ 关注分布?
│ │ ├─ 箱线图 (boxplot)
│ │ └─ 小提琴图 (violinplot)
│ └─ 关注关系?
│ └─ 分组散点图 + 分组回归线
│
└─ 多个连续变量?
├─ 相关矩阵热力图 (heatmap)
└─ 散点图矩阵 (pairplot)
关键要点
相关 ≠ 因果:
- 散点图显示相关性
- 因果推断需要控制混淆变量
- 警惕 Simpson's Paradox
数据转换的重要性:
- 右偏分布 → log 转换
- 非线性关系 → 多项式或 LOWESS
分组分析:
- 检验交互效应
- 识别异质性
- 控制混淆因素
大数据可视化:
- N ≥ 1000:使用六边形图、2D 直方图或等高线图
- 避免过度绘制(overplotting)
下节预告
在下一节中,我们将深入学习回归分析的可视化,包括残差诊断、影响力分析等高级技巧。
从双变量到多变量,从相关到因果!