Skip to content

Descriptive Statistics and Group Analysis

Replicating Stata's summarize, tabstat, and bysort


Basic Descriptive Statistics

python
import pandas as pd
import numpy as np

# Sample data: ten rows with age, income, and gender columns
df = pd.DataFrame(dict(
    age=[25, 30, 35, 40, 45, 50, 28, 32, 38, 42],
    income=[50000, 60000, 75000, 80000, 90000, 95000, 55000, 65000, 78000, 85000],
    gender=['M', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'F'],
))

# Pull the income column out once and reuse it below
income = df['income']

# Summary table produced by describe()
print(income.describe())

# The same kind of figures, computed one at a time
print(f"Mean: {income.mean()}")
print(f"Median: {income.median()}")
print(f"Standard deviation: {income.std()}")
print(f"Variance: {income.var()}")

Stata Command Comparison

summarize

stata
* Stata
* Summarize the variables income and age
summarize income age
* Summarize income again, this time with the detail option
summarize income, detail
python
# Pandas: one summary table covering both columns
both = df[['income', 'age']]
both.describe()

# Detailed statistics: request the 10th/25th/50th/75th/90th percentiles
income_percentiles = [0.10, 0.25, 0.50, 0.75, 0.90]
df['income'].describe(percentiles=income_percentiles)

tabstat

stata
* Stata
* Income statistics listed in stat(), reported separately by gender
tabstat income, by(gender) stat(mean sd median p25 p75)
python
# Pandas: named aggregation — each keyword becomes an output column
income_by_gender = df.groupby('gender')['income']
income_by_gender.agg(
    mean='mean',
    std='std',
    median='median',
    p25=lambda s: s.quantile(0.25),
    p75=lambda s: s.quantile(0.75),
)

bysort

stata
* Stata
* Run summarize on income within each gender group
bysort gender: summarize income
python
# Pandas: split the income column by gender, then describe each group
df['income'].groupby(df['gender']).describe()

Group Analysis

python
# Group by single variable: count, mean, and std of income per gender
df.groupby('gender')['income'].agg(['count', 'mean', 'std'])

# Group by multiple variables.
# The sample data has no 'education' column, so grouping on it directly
# raises KeyError. Attach an illustrative one first; assign() returns a new
# frame, so df itself is left unchanged.
education = ['BA', 'MA', 'BA', 'PhD', 'MA', 'PhD', 'BA', 'MA', 'BA', 'MA']
df.assign(education=education).groupby(['gender', 'education'])['income'].mean()

# Multiple columns, multiple statistics: the dict maps each column to the
# list of aggregations applied to it
df.groupby('gender').agg({
    'income': ['mean', 'median', 'std'],
    'age': ['mean', 'min', 'max']
})

Practical Example

Example: Complete Descriptive Statistics Report

python
def generate_report(df):
    """Print a descriptive-statistics report for ``df`` to standard output.

    The frame is accessed through the column labels 'age', 'income' and
    'gender'. The report has four parts: the row count, a describe() table
    for age and income, per-gender aggregates, and the age/income
    correlation matrix. Nothing is returned.
    """
    banner = "=" * 60

    # Title framed by two rules
    print(banner)
    print("Data Analysis Report")
    print(banner)

    # Number of rows
    print(f"\nSample size: {len(df)}")

    # Summary table for the two numeric columns
    print("\n[Descriptive Statistics]")
    numeric = df[['age', 'income']]
    print(numeric.describe())

    # Income count/mean/std and mean age within each gender
    print("\n[Statistics by Gender]")
    per_gender_spec = {
        'income': ['count', 'mean', 'std'],
        'age': 'mean',
    }
    print(df.groupby('gender').agg(per_gender_spec))

    # Pairwise correlation matrix of age and income
    print("\n[Correlation Coefficient]")
    print(numeric.corr())

# Print the report for the sample DataFrame defined above
generate_report(df)

Next Steps

Next section: Web Data Scraping

Keep going!

Released under the MIT License. Content © Author.