Descriptive Statistics and Group Analysis
Replicating Stata's summarize, tabstat, and bysort
Basic Descriptive Statistics
python
import pandas as pd
import numpy as np
# Sample data: ten respondents with age, income, and gender
records = {
    'age': [25, 30, 35, 40, 45, 50, 28, 32, 38, 42],
    'income': [50000, 60000, 75000, 80000, 90000, 95000, 55000, 65000, 78000, 85000],
    'gender': ['M', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'F'],
}
df = pd.DataFrame(records)

# Summary statistics for the income column in one call
income = df['income']
print(income.describe())

# The same kind of statistics, computed one at a time
print(f"Mean: {income.mean()}")
print(f"Median: {income.median()}")
print(f"Standard deviation: {income.std()}")
print(f"Variance: {df['income'].var()}")
Stata Command Comparison
summarize
stata
* Stata
summarize income age
summarize income, detail
python
# Pandas
df[['income', 'age']].describe()
# Detailed statistics
df['income'].describe(percentiles=[.1, .25, .5, .75, .9])
tabstat
stata
* Stata
tabstat income, by(gender) stat(mean sd median p25 p75)
python
# Pandas
df.groupby('gender')['income'].agg([
'mean', 'std', 'median',
('p25', lambda x: x.quantile(0.25)),
('p75', lambda x: x.quantile(0.75))
])
bysort
stata
* Stata
bysort gender: summarize income
python
# Pandas
df.groupby('gender')['income'].describe()
Group Analysis
python
# Group by single variable
df.groupby('gender')['income'].agg(['count', 'mean', 'std'])
# Group by multiple variables
# The sample df defines only age, income and gender, so grouping by
# 'education' would raise KeyError. Add an illustrative education column
# (one value per row) on a copy, leaving df itself unchanged.
df_edu = df.assign(
    education=['BA', 'MA', 'BA', 'PhD', 'MA', 'BA', 'BA', 'MA', 'PhD', 'MA']
)
df_edu.groupby(['gender', 'education'])['income'].mean()
# Multiple columns, multiple statistics
df.groupby('gender').agg({
'income': ['mean', 'median', 'std'],
'age': ['mean', 'min', 'max']
})
Practical Example
Example: Complete Descriptive Statistics Report
python
def generate_report(df):
    """Print a descriptive-statistics report for ``df`` to stdout.

    The report contains, in order: the sample size, summary statistics for
    the ``age`` and ``income`` columns, income and age statistics grouped by
    ``gender``, and the correlation matrix of ``age`` and ``income``.

    Args:
        df: DataFrame with ``age``, ``income`` and ``gender`` columns.

    Returns:
        None. Everything is written with ``print``.
    """
    print("=" * 60)
    print("Data Analysis Report")
    print("=" * 60)

    # Sample size
    print(f"\nSample size: {len(df)}")

    # Descriptive statistics
    print("\n[Descriptive Statistics]")
    print(df[['age', 'income']].describe())

    # Group by gender
    print("\n[Statistics by Gender]")
    gender_stats = df.groupby('gender').agg({
        'income': ['count', 'mean', 'std'],
        'age': 'mean',
    })
    print(gender_stats)

    # Correlation coefficient
    print("\n[Correlation Coefficient]")
    print(df[['age', 'income']].corr())
generate_report(df)
Next Steps
Next section: Web Data Scraping
Keep going!