Skip to content

Data Visualization

Matplotlib and Seaborn — Python's Plotting Tools


Two Major Visualization Libraries

  • Matplotlib: The foundational plotting library (similar to Stata's graph commands)
  • Seaborn: High-level library built on Matplotlib (similar to R's ggplot2)

Matplotlib Basics

Installation and Import

Install both libraries first (for example, `pip install matplotlib seaborn`), then import them:

python
import matplotlib.pyplot as plt
import numpy as np

# Display plots in Jupyter
%matplotlib inline

Basic Plotting

python
# Five sample points shared by the line, scatter, and bar examples
xs = [1, 2, 3, 4, 5]
ys = [2, 4, 5, 4, 5]

# Line plot: points are joined in the order given, then titled and labelled
plt.plot(xs, ys)
plt.title('Simple Line Plot')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.show()

# Scatter plot: the same points drawn as unconnected markers
plt.scatter(xs, ys)
plt.show()

# Bar chart: one bar per x position, with height taken from ys
plt.bar(xs, ys)
plt.show()

# Histogram: 1000 standard-normal draws grouped into 30 bins;
# the black edge colour keeps neighbouring bars visually distinct
samples = np.random.randn(1000)
plt.hist(samples, bins=30, edgecolor='black')
plt.show()

Practical Examples

Example 1: Income Distribution

python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulated data: a log-normal draw gives the right-skewed shape typical of
# incomes. exp(10.8) is roughly 49,000, so the samples are already on a
# dollar scale and need no further multiplication.
np.random.seed(42)  # fixed seed so the figure is reproducible
incomes = np.random.lognormal(mean=10.8, sigma=0.5, size=1000)

# Create a figure with two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left: histogram of the simulated incomes
axes[0].hist(incomes, bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Income ($)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Income Distribution')

# Right: box plot of the same data
axes[1].boxplot(incomes)
axes[1].set_ylabel('Income ($)')
axes[1].set_title('Income Boxplot')

# Adjust spacing so the two panels and their labels do not overlap
plt.tight_layout()
plt.show()

Example 2: Age-Income Relationship

python
# Data: income rises linearly with age, plus normally distributed noise
ages = np.random.randint(22, 65, 200)
incomes = 20000 + ages * 1500 + np.random.normal(0, 10000, 200)

# Scatter plot + fitted line
plt.figure(figsize=(10, 6))
plt.scatter(ages, incomes, alpha=0.5)

# Fit a degree-1 polynomial (a straight line) by least squares
z = np.polyfit(ages, incomes, 1)
p = np.poly1d(z)

# Draw the fitted line between the smallest and largest observed age only.
# Plotting p(ages) against the unsorted ages would retrace the same line
# many times, and the overlapping dashes would smear into an uneven stroke.
age_range = np.array([ages.min(), ages.max()])
plt.plot(age_range, p(age_range), "r--", linewidth=2)

plt.xlabel('Age')
plt.ylabel('Income ($)')
plt.title('Age vs Income')
plt.grid(True, alpha=0.3)  # faint grid to help read values off the axes
plt.show()

Seaborn: More Beautiful Charts

python
import seaborn as sns

# Switch to seaborn's "whitegrid" style for the plots that follow
sns.set_style("whitegrid")

# Simulated data set of 300 rows; every column is drawn separately,
# so the columns are unrelated to one another
df = pd.DataFrame({
    'age': np.random.randint(low=22, high=65, size=300),
    'income': np.random.normal(loc=60000, scale=20000, size=300),
    'gender': np.random.choice(['M', 'F'], size=300),
    'education': np.random.choice(['HS', 'BA', 'MA'], size=300),
})

Common Chart Types

python
# 1. Scatter plot with a fitted regression line: income against age
sns.regplot(x='age', y='income', data=df)
plt.show()

# 2. Grouped box plot: income by education level, split by gender
sns.boxplot(x='education', y='income', hue='gender', data=df)
plt.show()

# 3. Distribution plot: income histogram per gender with a KDE curve overlaid
sns.histplot(x='income', hue='gender', kde=True, data=df)
plt.show()

# 4. Violin plot: income by education level
sns.violinplot(x='education', y='income', data=df)
plt.show()

# 5. Pair plot (multivariate relationships), coloured by gender
sns.pairplot(data=df, hue='gender')
plt.show()

Stata Chart Comparison

Stata: Scatter Plot

stata
* Stata: scatter plot of income (y-axis) against age (x-axis)
twoway scatter income age

Python: Equivalent Code

python
# Matplotlib: pass the two columns directly and label the axes by hand
plt.scatter(x=df['age'], y=df['income'])
plt.ylabel('Income')
plt.xlabel('Age')
plt.show()

# Seaborn (simpler): refer to the columns by name
sns.scatterplot(x='age', y='income', data=df)
plt.show()

Useful Techniques

Subplot Layout

python
# A 2x2 grid of panels; unpack it row by row so each panel has a name
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_age, ax_income), (ax_scatter, ax_box) = axes

# Top left: histogram of age
ax_age.hist(df['age'], bins=20)
ax_age.set_title('Age Distribution')

# Top right: histogram of income
ax_income.hist(df['income'], bins=20)
ax_income.set_title('Income Distribution')

# Bottom left: age against income
ax_scatter.scatter(df['age'], df['income'])
ax_scatter.set_title('Age vs Income')

# Bottom right: one box per education level, in this fixed order.
# The same list supplies the tick labels so boxes and labels stay aligned.
levels = ['HS', 'BA', 'MA']
ax_box.boxplot([df[df['education'] == level]['income'] for level in levels])
ax_box.set_xticklabels(levels)
ax_box.set_title('Income by Education')

# Adjust spacing so titles and tick labels do not overlap
plt.tight_layout()
plt.show()

Saving Charts

python
# Build the chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['age'], df['income'])
ax.set_title('Age vs Income')
ax.set_xlabel('Age')
ax.set_ylabel('Income')

# Save as a high-resolution PNG, trimming surplus whitespace around the plot
fig.savefig('age_income.png', dpi=300, bbox_inches='tight')
# Save again as PDF: vector format (for papers)
fig.savefig('age_income.pdf')

Practice Exercises

python
# Using the provided data, create:
# 1. Income distribution histogram (faceted by gender)
# 2. Age-income scatter plot (colored by education level)
# 3. Income box plot by education level
# 4. Save all charts

Next Steps

Next section: Descriptive Statistics and Group Analysis

Keep going!

Released under the MIT License. Content © Author.