数据可视化

Matplotlib 与 Seaborn —— Python 的绘图工具

两大可视化库

Matplotlib：基础绘图库（类似 Stata graph）
Seaborn：基于 Matplotlib 的高级库（类似 R ggplot2）

Matplotlib 基础

安装与导入

python

import matplotlib.pyplot as plt
import numpy as np

# Report the installed library versions.
# Import the top-level package explicitly: `plt.matplotlib` only resolves
# because pyplot happens to import matplotlib internally.
import matplotlib

print(f"Matplotlib version: {matplotlib.__version__}")
print(f"NumPy version: {np.__version__}")

基本绘图

python

import matplotlib.pyplot as plt
import numpy as np

# The labels below are Chinese; Matplotlib's default font has no CJK glyphs
# and would draw empty boxes. Try common CJK fonts first, keeping DejaVu Sans
# as the final fallback.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'Arial Unicode MS',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'DejaVu Sans',
]
# Many CJK fonts lack U+2212, so draw negative tick labels with an ASCII hyphen.
plt.rcParams['axes.unicode_minus'] = False

# Sample data
x = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 5]

# 2x2 grid showing the four basic chart types
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Line chart
axes[0, 0].plot(x, y, marker='o')
axes[0, 0].set_xlabel('X轴')
axes[0, 0].set_ylabel('Y轴')
axes[0, 0].set_title('折线图')

# Scatter plot
axes[0, 1].scatter(x, y, s=100, c='orange')
axes[0, 1].set_title('散点图')

# Bar chart
axes[1, 0].bar(x, y, color='green', edgecolor='black')
axes[1, 0].set_title('柱状图')

# Histogram; the seed is fixed so every run draws the same picture
np.random.seed(42)
data = np.random.randn(1000)
axes[1, 1].hist(data, bins=30, edgecolor='black')
axes[1, 1].set_title('直方图')

plt.tight_layout()
plt.show()

完整可运行示例（直接复制运行）：

python

import matplotlib.pyplot as plt
import numpy as np

# The labels below are Chinese; pick a CJK-capable font (DejaVu Sans stays as
# the last fallback) so they are not drawn as empty boxes.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'Arial Unicode MS',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'DejaVu Sans',
]
# Many CJK fonts lack U+2212, so draw negative tick labels with an ASCII hyphen.
plt.rcParams['axes.unicode_minus'] = False

# Line chart
x = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 5]
plt.plot(x, y)
plt.xlabel('X轴')
plt.ylabel('Y轴')
plt.title('简单折线图')
plt.show()

# Histogram; the seed is fixed so every run draws the same picture
np.random.seed(42)
data = np.random.randn(1000)
plt.hist(data, bins=30, edgecolor='black')
plt.title('随机数据直方图')
plt.show()

实战案例

案例 1：收入分布

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulated incomes in dollars (log-normal).
# exp(10.8) is roughly 49,000, so the draw is already on a dollar scale; the
# former extra "* 1000" pushed the median to about $49 million while the plots
# label the axis "Income ($)".
np.random.seed(42)
incomes = np.random.lognormal(10.8, 0.5, 1000)

# One figure with two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_hist, ax_box = axes

# Left panel: histogram of incomes
ax_hist.hist(incomes, bins=50, edgecolor='black', alpha=0.7)
ax_hist.set(xlabel='Income ($)', ylabel='Frequency', title='Income Distribution')

# Right panel: box plot of the same sample
ax_box.boxplot(incomes)
ax_box.set(ylabel='Income ($)', title='Income Boxplot')

fig.tight_layout()
plt.show()

完整可运行示例（直接复制运行）：

python

import numpy as np
import matplotlib.pyplot as plt

# Simulated incomes in dollars (log-normal).
# exp(10.8) is roughly 49,000, so no further scaling is needed; multiplying by
# 1000 would put the median near $49 million on an axis labelled "Income ($)".
np.random.seed(42)
incomes = np.random.lognormal(10.8, 0.5, 1000)

# Figure with 1 row and 2 columns of axes
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
left, right = axes

# Left: histogram
left.hist(incomes, bins=50, edgecolor='black', alpha=0.7)
left.set(
    xlabel='Income ($)',
    ylabel='Frequency',
    title='Income Distribution',
)

# Right: box plot
right.boxplot(incomes)
right.set(ylabel='Income ($)', title='Income Boxplot')

fig.tight_layout()
plt.show()

案例 2：年龄-收入关系

python

import numpy as np
import matplotlib.pyplot as plt

# Simulated data: income rises linearly with age, plus Gaussian noise
np.random.seed(123)
ages = np.random.randint(22, 65, 200)
incomes = 20000 + ages * 1500 + np.random.normal(0, 10000, 200)

# Scatter plot of the raw observations
plt.figure(figsize=(10, 6))
plt.scatter(ages, incomes, alpha=0.5)

# Least-squares straight line through the points
slope, intercept = np.polyfit(ages, incomes, 1)
# Draw the line between the youngest and oldest age only. Feeding the
# unsorted `ages` to plot() makes the dashed line run back and forth over
# itself, so the dashes overlap and the line looks solid and ragged.
age_span = np.array([ages.min(), ages.max()])
plt.plot(age_span, slope * age_span + intercept, "r--", linewidth=2)

plt.xlabel('Age')
plt.ylabel('Income ($)')
plt.title('Age vs Income')
plt.grid(True, alpha=0.3)
plt.show()

完整可运行示例（直接复制运行）：

python

import numpy as np
import matplotlib.pyplot as plt

# Simulated age/income sample: linear trend in age plus Gaussian noise
np.random.seed(123)
ages = np.random.randint(22, 65, 200)
noise = np.random.normal(0, 10000, 200)
incomes = 20000 + ages * 1500 + noise

# Scatter of the observations
plt.figure(figsize=(10, 6))
plt.scatter(ages, incomes, alpha=0.5, label='Data')

# Degree-1 polynomial fit, evaluated on the ages in ascending order
p = np.poly1d(np.polyfit(ages, incomes, 1))
ages_ascending = sorted(ages)
plt.plot(ages_ascending, p(ages_ascending), "r--", linewidth=2, label='Fit')

plt.xlabel('Age')
plt.ylabel('Income ($)')
plt.title('Age vs Income')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Seaborn：更美观的图表

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")  # 设置风格

# Build the demo data set. The four random draws are made in column order
# (age, income, gender, education) right after seeding.
np.random.seed(42)
n_obs = 300
columns = {
    'age': np.random.randint(22, 65, n_obs),
    'income': np.random.normal(60000, 20000, n_obs),
    'gender': np.random.choice(['M', 'F'], n_obs),
    'education': np.random.choice(['HS', 'BA', 'MA'], n_obs),
}
df = pd.DataFrame(columns)

# Preview the first rows and report the (rows, columns) shape
print(df.head())
print(f"\n数据形状: {df.shape}")

常用图表

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# The subplot titles are Chinese, so a CJK-capable font is required or they
# render as empty boxes. The override is placed AFTER sns.set_style because
# the seaborn style also sets the sans-serif font list.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'Arial Unicode MS',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'DejaVu Sans',
]
# Many CJK fonts lack U+2212, so draw negative tick labels with an ASCII hyphen.
plt.rcParams['axes.unicode_minus'] = False

# Build the demo data set
np.random.seed(42)
df = pd.DataFrame({
    'age': np.random.randint(22, 65, 300),
    'income': np.random.normal(60000, 20000, 300),
    'gender': np.random.choice(['M', 'F'], 300),
    'education': np.random.choice(['HS', 'BA', 'MA'], 300)
})

# 2x2 grid showing the commonly used chart types
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# 1. Scatter plot with regression line
sns.regplot(data=df, x='age', y='income', ax=axes[0, 0])
axes[0, 0].set_title('散点图 + 回归线')

# 2. Grouped box plot (education on x, split by gender)
sns.boxplot(data=df, x='education', y='income', hue='gender', ax=axes[0, 1])
axes[0, 1].set_title('分组箱线图')

# 3. Distribution plot with a kernel density estimate per gender
sns.histplot(data=df, x='income', hue='gender', kde=True, ax=axes[1, 0])
axes[1, 0].set_title('分布图')

# 4. Violin plot
sns.violinplot(data=df, x='education', y='income', ax=axes[1, 1])
axes[1, 1].set_title('小提琴图')

plt.tight_layout()
plt.show()

完整可运行示例（直接复制运行）：

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot style
sns.set_style("whitegrid")

# Simulated data (draw order: age, income, gender, education)
np.random.seed(42)
n_obs = 300
df = pd.DataFrame({
    'age': np.random.randint(22, 65, n_obs),
    'income': np.random.normal(60000, 20000, n_obs),
    'gender': np.random.choice(['M', 'F'], n_obs),
    'education': np.random.choice(['HS', 'BA', 'MA'], n_obs)
})

# Three panels in a single row
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
ax_reg, ax_box, ax_dist = axes

# 1. Scatter plot with regression line
sns.regplot(data=df, x='age', y='income', ax=ax_reg)
ax_reg.set_title('Age vs Income with Regression Line')

# 2. Grouped box plot
sns.boxplot(data=df, x='education', y='income', hue='gender', ax=ax_box)
ax_box.set_title('Income by Education and Gender')

# 3. Histogram with kernel density estimate
sns.histplot(data=df, x='income', hue='gender', kde=True, ax=ax_dist)
ax_dist.set_title('Income Distribution by Gender')

fig.tight_layout()
plt.show()

Stata 图表对比

Stata: 散点图

stata

* Stata: scatter plot of income (y-axis) against age (x-axis)
twoway scatter income age

Python: 等价代码

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Demo data
np.random.seed(42)
sample_size = 100
df = pd.DataFrame({
    'age': np.random.randint(22, 65, sample_size),
    'income': np.random.normal(60000, 20000, sample_size)
})

# Two panels to compare the two APIs side by side
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
mpl_ax, sns_ax = axes

# Matplotlib version: pass the columns and label the axes by hand
mpl_ax.scatter(df['age'], df['income'])
mpl_ax.set(xlabel='Age', ylabel='Income', title='Matplotlib')

# Seaborn version (shorter): pass the frame and the column names
sns.scatterplot(data=df, x='age', y='income', ax=sns_ax)
sns_ax.set_title('Seaborn')

fig.tight_layout()
plt.show()

完整可运行示例（直接复制运行）：

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Simulated data
np.random.seed(42)
sample_size = 100
df = pd.DataFrame({
    'age': np.random.randint(22, 65, sample_size),
    'income': np.random.normal(60000, 20000, sample_size)
})

plt.figure(figsize=(10, 4))

# Matplotlib version, drawn in the left slot
left_ax = plt.subplot(1, 2, 1)
left_ax.scatter(df['age'], df['income'])
left_ax.set(xlabel='Age', ylabel='Income', title='Matplotlib Scatter')

# Seaborn version (more concise), drawn in the right slot
right_ax = plt.subplot(1, 2, 2)
sns.scatterplot(data=df, x='age', y='income', ax=right_ax)
right_ax.set_title('Seaborn Scatter')

plt.tight_layout()
plt.show()

实用技巧

子图布局

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Demo data
np.random.seed(42)
n_obs = 300
df = pd.DataFrame({
    'age': np.random.randint(22, 65, n_obs),
    'income': np.random.normal(60000, 20000, n_obs),
    'education': np.random.choice(['HS', 'BA', 'MA'], n_obs)
})

# 2x2 grid of axes, unpacked row by row
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_age, ax_income), (ax_scatter, ax_box) = axes

# Top-left: age distribution
ax_age.hist(df['age'], bins=20, edgecolor='black')
ax_age.set_title('Age Distribution')

# Top-right: income distribution
ax_income.hist(df['income'], bins=20, edgecolor='black')
ax_income.set_title('Income Distribution')

# Bottom-left: age vs income scatter
ax_scatter.scatter(df['age'], df['income'], alpha=0.5)
ax_scatter.set_title('Age vs Income')

# Bottom-right: one income box per education level, in the order listed
levels = ['HS', 'BA', 'MA']
income_by_level = [df.loc[df['education'] == level, 'income'] for level in levels]
ax_box.boxplot(income_by_level)
ax_box.set_xticklabels(levels)
ax_box.set_title('Income by Education')

fig.tight_layout()
plt.show()

完整可运行示例（直接复制运行）：

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulated data
np.random.seed(42)
n_obs = 300
df = pd.DataFrame({
    'age': np.random.randint(22, 65, n_obs),
    'income': np.random.normal(60000, 20000, n_obs),
    'education': np.random.choice(['HS', 'BA', 'MA'], n_obs)
})

# 2x2 grid of axes, unpacked row by row
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_age, ax_income), (ax_scatter, ax_box) = axes

# Top-left: age distribution
ax_age.hist(df['age'], bins=20, edgecolor='black')
ax_age.set(title='Age Distribution', xlabel='Age')

# Top-right: income distribution
ax_income.hist(df['income'], bins=20, edgecolor='black', color='orange')
ax_income.set(title='Income Distribution', xlabel='Income')

# Bottom-left: age vs income scatter
ax_scatter.scatter(df['age'], df['income'], alpha=0.5)
ax_scatter.set(title='Age vs Income', xlabel='Age', ylabel='Income')

# Bottom-right: one income box per education level, in the order listed
levels = ['HS', 'BA', 'MA']
income_by_level = [df.loc[df['education'] == level, 'income'] for level in levels]
ax_box.boxplot(income_by_level)
ax_box.set_xticklabels(levels)
ax_box.set(title='Income by Education', ylabel='Income')

fig.tight_layout()
plt.show()

保存图表

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Demo data
np.random.seed(42)
sample_size = 100
df = pd.DataFrame({
    'age': np.random.randint(22, 65, sample_size),
    'income': np.random.normal(60000, 20000, sample_size)
})

# Draw the chart on a single axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['age'], df['income'], alpha=0.6)
ax.set(xlabel='Age', ylabel='Income', title='Age vs Income')
ax.grid(True, alpha=0.3)

# Save (uncomment to write the figure to a file)
# plt.savefig('age_income.png', dpi=300, bbox_inches='tight')
# plt.savefig('age_income.pdf')  # vector format (for papers)

plt.show()

完整可运行示例（直接复制运行）：

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulated data
np.random.seed(42)
sample_size = 100
df = pd.DataFrame({
    'age': np.random.randint(22, 65, sample_size),
    'income': np.random.normal(60000, 20000, sample_size)
})

# Create the chart: scatter points with a black outline
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['age'], df['income'], alpha=0.6, edgecolors='black')
ax.set(xlabel='Age', ylabel='Income ($)', title='Age vs Income')
ax.grid(True, alpha=0.3)

# Save the chart (uncomment to write the figure to a file)
# plt.savefig('age_income.png', dpi=300, bbox_inches='tight')
# plt.savefig('age_income.pdf')  # vector format (for papers)

plt.show()

练习题

python

# Exercise:
# Using the provided data, create:
# 1. Income distribution histograms (faceted by gender)
# 2. An age vs income scatter plot (coloured by education level)
# 3. An income box plot for each education level
# 4. Save all of the charts

print("请参考下方完整可运行示例完成练习！")

完整可运行练习模板（直接复制运行）：

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot style
sns.set_style("whitegrid")

# Simulated data
np.random.seed(42)
df = pd.DataFrame({
    'age': np.random.randint(22, 65, 300),
    'income': np.random.normal(60000, 20000, 300),
    'gender': np.random.choice(['M', 'F'], 300),
    'education': np.random.choice(['HS', 'BA', 'MA'], 300)
})

# 2x2 layout
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Income histograms faceted by gender (top row).
# Both panels use the same bin edges, computed from the full sample, so the
# bars line up; bins=20 on each subset would give each panel its own edges.
income_bins = np.histogram_bin_edges(df['income'], bins=20)
gender_panels = [('M', 'Male', 'steelblue'), ('F', 'Female', 'coral')]
for ax, (code, label, bar_color) in zip(axes[0], gender_panels):
    ax.hist(df.loc[df['gender'] == code, 'income'], bins=income_bins,
            edgecolor='black', color=bar_color)
    ax.set_title(f'Income Distribution ({label})')
    ax.set_xlabel('Income')

# 2. Age vs income scatter, one colour per education level
colors = {'HS': 'red', 'BA': 'blue', 'MA': 'green'}
for edu, point_color in colors.items():
    subset = df[df['education'] == edu]
    # `color=` is the keyword for a single named colour
    axes[1, 0].scatter(subset['age'], subset['income'],
                       color=point_color, label=edu, alpha=0.6)
axes[1, 0].set_xlabel('Age')
axes[1, 0].set_ylabel('Income')
axes[1, 0].set_title('Age vs Income by Education')
axes[1, 0].legend()

# 3. Income box plot per education level
sns.boxplot(data=df, x='education', y='income', order=['HS', 'BA', 'MA'], ax=axes[1, 1])
axes[1, 1].set_title('Income by Education Level')

plt.tight_layout()

# 4. Save the figure (uncomment to write it to a file; keep it before show())
# fig.savefig('exercise_plots.png', dpi=300, bbox_inches='tight')

plt.show()

下一步

下一节：描述统计与分组分析

继续！

计量经济学可视化

时间序列分析

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulated series: cumulative sum of 100 standard-normal draws, one per date
np.random.seed(42)
dates = pd.date_range(start='2020-01-01', periods=100)
data = np.random.randn(100).cumsum()

# Line plot of the series with a dashed reference line at zero
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(dates, data, label='Time Series Data')
ax.axhline(0, color='red', linestyle='--', linewidth=1, label='Zero Line')
ax.set(title='Time Series Visualization', xlabel='Date', ylabel='Value')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

回归诊断图

python

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Simulated data: y = 2x + standard-normal noise
np.random.seed(42)
x = np.random.normal(size=100)
noise = np.random.normal(size=100)
y = 2 * x + noise

# OLS fit with an intercept column added to the regressor
X = sm.add_constant(x)
model = sm.OLS(y, X).fit()

# Print the regression summary table
print(model.summary())

# Diagnostic plots for the regressor named 'x1'
# NOTE(review): 'x1' is presumably the name statsmodels assigns to the
# unnamed array column -- confirm against model.model.exog_names.
diag_canvas = plt.figure(figsize=(12, 8))
fig = sm.graphics.plot_regress_exog(model, 'x1', fig=diag_canvas)
plt.tight_layout()
plt.show()

面板数据可视化

python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Simulate a balanced panel: 5 entities (id 1-5), each observed at times 1-10.
np.random.seed(42)
n_ids, n_periods = 5, 10

# One random walk per entity (rows = entities). A single cumsum over all 50
# draws would make each entity's path start where the previous one ended.
walks = np.random.randn(n_ids * n_periods).reshape(n_ids, n_periods).cumsum(axis=1)

# Entity-specific level shift 0..4. It uses repeat (not tile) so that each
# shift lines up with the rows of its own id.
offsets = np.repeat(np.arange(n_ids), n_periods)

data = pd.DataFrame({
    'id': np.repeat(range(1, n_ids + 1), n_periods),
    'time': list(range(1, n_periods + 1)) * n_ids,
    'value': walks.ravel() + offsets,
})

# Preview the first 15 rows of the panel
print("面板数据示例：")
print(data.head(15))

# Line plot of the panel: one line per id
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=data, x='time', y='value', hue='id', palette='tab10', ax=ax)
ax.set(title='Panel Data Visualization', xlabel='Time', ylabel='Value')
ax.legend(title='ID')
ax.grid(alpha=0.3)
plt.show()

数据可视化

两大可视化库

Matplotlib 基础

安装与导入

基本绘图

实战案例

案例 1：收入分布

案例 2：年龄-收入关系

Seaborn：更美观的图表

常用图表

Stata 图表对比

Stata: 散点图

Python: 等价代码

实用技巧

子图布局

保存图表

练习题

下一步

计量经济学可视化

时间序列分析

回归诊断图

面板数据可视化

数据可视化

两大可视化库

Matplotlib 基础

安装与导入

基本绘图

实战案例

案例 1：收入分布

案例 2：年龄-收入关系

Seaborn：更美观的图表

常用图表

Stata 图表对比

Stata: 散点图

Python: 等价代码

实用技巧

子图布局

保存图表

练习题

下一步

计量经济学可视化

时间序列分析

回归诊断图

面板数据可视化