描述统计与分组分析
复刻 Stata 的 summarize, tabstat, bysort
基本描述统计
python
import pandas as pd
import numpy as np
# Build the small demo dataset; the snippets that follow reuse `df`.
df = pd.DataFrame(
    {
        "age": [25, 30, 35, 40, 45, 50, 28, 32, 38, 42],
        "income": [
            50000, 60000, 75000, 80000, 90000,
            95000, 55000, 65000, 78000, 85000,
        ],
        "gender": ["M", "F", "M", "F", "M", "M", "F", "F", "M", "F"],
    }
)

# Summary table for the income column.
income = df["income"]
print(income.describe())

# Individual statistics, printed one per line.
print(f"均值: {income.mean()}")
print(f"中位数: {income.median()}")
print(f"标准差: {income.std()}")
print(f"方差: {df['income'].var()}")
Stata 命令对照
summarize
stata
* Stata
summarize income age
summarize income, detail
python
# Pandas
df[['income', 'age']].describe()
# 详细统计
df['income'].describe(percentiles=[.1, .25, .5, .75, .9])
tabstat
stata
* Stata
tabstat income, by(gender) stat(mean sd median p25 p75)
python
# Pandas
df.groupby('gender')['income'].agg([
'mean', 'std', 'median',
('p25', lambda x: x.quantile(0.25)),
('p75', lambda x: x.quantile(0.75))
])
bysort
stata
* Stata
bysort gender: summarize income
python
# Pandas
df.groupby('gender')['income'].describe()
分组分析
python
# Group by a single variable.
df.groupby('gender')['income'].agg(['count', 'mean', 'std'])
# Group by several variables.  The example df built at the top of this
# page only has age/income/gender, so grouping on 'education' directly
# raises KeyError.  Attach an illustrative education column first;
# assign() returns a new frame, so df itself is left unchanged.
df.assign(
    education=['BA', 'MA', 'BA', 'PhD', 'MA', 'PhD', 'BA', 'MA', 'BA', 'MA']
).groupby(['gender', 'education'])['income'].mean()
# 多列多统计
df.groupby('gender').agg({
'income': ['mean', 'median', 'std'],
'age': ['mean', 'min', 'max']
})
实战案例
案例:完整的描述统计报告
python
def generate_report(df):
    """Print a plain-text descriptive-statistics report for ``df``.

    The report goes to standard output in four parts: the sample size,
    ``describe()`` output for the ``age`` and ``income`` columns,
    per-gender income count/mean/std together with mean age, and the
    correlation matrix of ``age`` and ``income``.

    Args:
        df: DataFrame containing ``age``, ``income`` and ``gender``
            columns.

    Returns:
        None; the report is only printed.
    """
    banner = "=" * 60
    print(banner)
    print("数据分析报告")
    print(banner)

    # Sample size
    print(f"\n样本量: {len(df)}")

    # Descriptive statistics
    print("\n【描述统计】")
    print(df[['age', 'income']].describe())

    # Statistics by gender
    print("\n【按性别统计】")
    gender_stats = df.groupby('gender').agg({
        'income': ['count', 'mean', 'std'],
        'age': 'mean',
    })
    print(gender_stats)

    # Correlation between age and income
    print("\n【相关系数】")
    print(df[['age', 'income']].corr())
generate_report(df)
下一步
下一节:网页数据爬取
继续!