Data Visualization
Matplotlib and Seaborn — Python's Plotting Tools
Two Major Visualization Libraries
- Matplotlib: Foundation plotting library (similar to Stata graph)
- Seaborn: High-level library built on Matplotlib (similar to R's ggplot2)
Matplotlib Basics
Installation and Import
python
import matplotlib.pyplot as plt
import numpy as np
# Display plots in Jupyter
%matplotlib inline
Basic Plotting
python
# Line plot: draw y against x, then label both axes and the figure
x = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 5]
plt.plot(x, y)
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Simple Line Plot')
plt.show()
# Scatter plot of the same x/y points
plt.scatter(x, y)
plt.show()
# Bar chart: one bar per x position, with heights taken from y
plt.bar(x, y)
plt.show()
# Histogram: 1000 draws from np.random.randn, grouped into 30 bins;
# edgecolor='black' outlines each bar
data = np.random.randn(1000)
plt.hist(data, bins=30, edgecolor='black')
plt.show()
Practical Examples
Example 1: Income Distribution
python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Simulated data: seed the generator so the figure is reproducible
np.random.seed(42)
# lognormal(10.8, 0.5) already has a median of about e**10.8 ≈ 49,000,
# so the draws are used directly as dollar amounts without further scaling
incomes = np.random.lognormal(10.8, 0.5, 1000)
# Create figure: one row, two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Left: Histogram of the simulated incomes (50 bins, slightly transparent bars)
axes[0].hist(incomes, bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Income ($)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Income Distribution')
# Right: Box plot of the same incomes
axes[1].boxplot(incomes)
axes[1].set_ylabel('Income ($)')
axes[1].set_title('Income Boxplot')
plt.tight_layout()
plt.show()
Example 2: Age-Income Relationship
python
# Data: 200 simulated people; income = 20000 + 1500 * age + normal noise
ages = np.random.randint(22, 65, 200)
incomes = 20000 + ages * 1500 + np.random.normal(0, 10000, 200)
# Scatter plot + fitted line
plt.figure(figsize=(10, 6))
plt.scatter(ages, incomes, alpha=0.5)
# Add fitted line: degree-1 polynomial fit of income on age
z = np.polyfit(ages, incomes, 1)
p = np.poly1d(z)
# Draw the line between the two extreme ages only. Plotting against the
# unsorted `ages` array would make the path double back over itself, so the
# dashes of the "r--" style would overlap and smear.
age_range = np.array([ages.min(), ages.max()])
plt.plot(age_range, p(age_range), "r--", linewidth=2)
plt.xlabel('Age')
plt.ylabel('Income ($)')
plt.title('Age vs Income')
plt.grid(True, alpha=0.3)
plt.show()
Seaborn: More Beautiful Charts
python
import seaborn as sns
sns.set_style("whitegrid") # Set style
# Data
df = pd.DataFrame({
'age': np.random.randint(22, 65, 300),
'income': np.random.normal(60000, 20000, 300),
'gender': np.random.choice(['M', 'F'], 300),
'education': np.random.choice(['HS', 'BA', 'MA'], 300)
})
Common Chart Types
python
# 1. Scatter plot + regression line: income against age
sns.regplot(data=df, x='age', y='income')
plt.show()
# 2. Grouped box plot: income by education level, split by gender via hue
sns.boxplot(data=df, x='education', y='income', hue='gender')
plt.show()
# 3. Distribution plot: income histogram per gender, with a density curve (kde=True)
sns.histplot(data=df, x='income', hue='gender', kde=True)
plt.show()
# 4. Violin plot: income distribution for each education level
sns.violinplot(data=df, x='education', y='income')
plt.show()
# 5. Pair plot (multivariate relationships), colored by gender
sns.pairplot(df, hue='gender')
plt.show()
Stata Chart Comparison
Stata: Scatter Plot
stata
* Stata
twoway scatter income age
Python: Equivalent Code
python
# Matplotlib: pass the two columns explicitly and label the axes by hand
plt.scatter(df['age'], df['income'])
plt.xlabel('Age')
plt.ylabel('Income')
plt.show()
# Seaborn (simpler): name the DataFrame and the columns to plot
sns.scatterplot(data=df, x='age', y='income')
plt.show()
Useful Techniques
Subplot Layout
python
# Build a 2x2 grid of panels and give each one a readable name
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_age, ax_income), (ax_scatter, ax_box) = axes

# Top row: histograms of age (left) and income (right)
ax_age.hist(df['age'], bins=20)
ax_age.set_title('Age Distribution')
ax_income.hist(df['income'], bins=20)
ax_income.set_title('Income Distribution')

# Bottom left: age against income
ax_scatter.scatter(df['age'], df['income'])
ax_scatter.set_title('Age vs Income')

# Bottom right: one income box per education level; the same list
# supplies both the groups and their tick labels
education_levels = ['HS', 'BA', 'MA']
income_by_level = [
    df.loc[df['education'] == level, 'income']
    for level in education_levels
]
ax_box.boxplot(income_by_level)
ax_box.set_xticklabels(education_levels)
ax_box.set_title('Income by Education')

plt.tight_layout()
plt.show()
Saving Charts
python
# Build the figure to be saved: age against income with labelled axes
plt.figure(figsize=(10, 6))
plt.scatter(df['age'], df['income'])
plt.xlabel('Age')
plt.ylabel('Income')
plt.title('Age vs Income')
# Save as a 300-dpi PNG; bbox_inches='tight' trims the surrounding whitespace
plt.savefig('age_income.png', dpi=300, bbox_inches='tight')
plt.savefig('age_income.pdf') # Vector format (for papers)
Practice Exercises
python
# Using the provided data, create:
# 1. Income distribution histogram (faceted by gender)
# 2. Age-income scatter plot (colored by education level)
# 3. Income box plot by education level
# 4. Save all charts
Next Steps
Next section: Descriptive Statistics and Group Analysis
Keep going!