Skip to content

6.5 Distribution Comparison

"Data visualization is at the intersection of art and science." — Alberto Cairo, Data Visualization Expert

Visual comparison of multiple data groups

Difficulty | Importance


Section Objectives

After completing this section, you will be able to:

  • Compare distributions across multiple groups (overlapping density plots, ECDF)
  • Use grouped box plots and violin plots
  • Create Ridgeline plots
  • Choose appropriate distribution comparison methods

Overlapping Density Plots

python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Simulate monthly wages for four regions from a log-linear wage equation:
#   log(wage) = 1.5 + 0.08 * education + regional effect + noise
np.random.seed(42)  # fixed seed so every run reproduces the same sample
n = 500  # observations drawn per region

# Additive shift applied to log-wage for each region (insertion order = plot order)
region_effects = {'East': 0.2, 'Central': 0.1, 'West': 0, 'Northeast': -0.1}

wages = []
for shift in region_effects.values():
    # Draw order is part of the reproducible output: schooling first, then noise
    schooling = np.random.normal(13, 3, n)
    noise = np.random.normal(0, 0.3, n)
    wages.extend(np.exp(1.5 + 0.08*schooling + shift + noise))

# One region label per simulated wage, in the same block order as `wages`
regions = [name for name in region_effects for _ in range(n)]

df = pd.DataFrame({'region': regions, 'wage': wages})

# Overlapping density plots: one semi-transparent KDE curve per region,
# all drawn on the same axes so the distribution shapes can be compared.
region_palette = {
    'East': '#1f77b4',
    'Central': '#ff7f0e',
    'West': '#2ca02c',
    'Northeast': '#d62728',
}

fig, ax = plt.subplots(figsize=(12, 6))

for region_name, curve_color in region_palette.items():
    region_wages = df.loc[df['region'] == region_name, 'wage']
    sns.kdeplot(region_wages, ax=ax, label=region_name, color=curve_color,
                linewidth=2, fill=True, alpha=0.3)

ax.set_xlabel('Wage (thousand yuan/month)', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
ax.set_title('Wage Distribution Comparison Across Regions', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

ECDF Comparison

python
# ECDF comparison: one empirical cumulative distribution curve per region.
# Reading across at a given height compares the regions at that quantile.
ecdf_regions = ['East', 'Central', 'West', 'Northeast']
ecdf_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

fig, ax = plt.subplots(figsize=(12, 6))

for region_name, curve_color in zip(ecdf_regions, ecdf_colors):
    region_wages = df.loc[df['region'] == region_name, 'wage']
    sns.ecdfplot(region_wages, ax=ax, label=region_name, color=curve_color, linewidth=2)

ax.set_xlabel('Wage (thousand yuan/month)', fontsize=12)
ax.set_ylabel('Cumulative Probability', fontsize=12)
ax.set_title('Regional Wage Cumulative Distribution Comparison', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Print the median wage of each region, in the same order used for the plots
print("\nRegional Wage Medians:")
for region_name in ('East', 'Central', 'West', 'Northeast'):
    region_wages = df.loc[df['region'] == region_name, 'wage']
    print(f"  {region_name}: {region_wages.median():.2f} thousand yuan")

Grouped Box Plots and Violin Plots

python
# Side-by-side comparison: grouped box plot (left) and grouped violin plot (right).
# Both panels share the same data, region order and axis styling.
region_order = ['East', 'Central', 'West', 'Northeast']
panels = (
    (sns.boxplot, 'Grouped Box Plot'),
    (sns.violinplot, 'Grouped Violin Plot'),
)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for panel_ax, (draw_panel, panel_title) in zip(axes, panels):
    draw_panel(x='region', y='wage', data=df, ax=panel_ax, order=region_order)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Region')
    panel_ax.set_ylabel('Wage (thousand yuan/month)')
    # Horizontal grid lines only, to help read wage levels across groups
    panel_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Ridgeline Plot

python
# Ridgeline-style plot: one stacked panel per region on a shared x-axis.
# Each panel overlays a filled KDE curve on a density-normalised histogram.
from matplotlib.collections import PolyCollection  # unused below; kept from the original snippet

regions_ordered = ['East', 'Central', 'West', 'Northeast']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
n_panels = len(regions_ordered)

# squeeze=False keeps `axes` two-dimensional even with a single region,
# so taking column 0 always yields a 1-D array of panels.
fig, axes = plt.subplots(n_panels, 1, figsize=(12, 8), sharex=True, squeeze=False)
axes = axes[:, 0]

for i, (region, color) in enumerate(zip(regions_ordered, colors)):
    subset = df[df['region'] == region]['wage']

    axes[i].hist(subset, bins=30, density=True, alpha=0.6, color=color, edgecolor='black')

    # KDE ridge. cut=0 limits the curve to the observed [min, max] range,
    # so no density is drawn for wages outside the data.
    sns.kdeplot(subset, ax=axes[i], cut=0, fill=True, alpha=0.6,
                color=color, linewidth=1.5)

    # kdeplot labels the axes itself; replace those labels with the region
    # name on the left and no per-panel x label.
    axes[i].set_xlabel('')
    axes[i].set_ylabel(region, fontsize=12, rotation=0, labelpad=40, va='center')
    axes[i].set_ylim(0, None)
    axes[i].spines['top'].set_visible(False)
    axes[i].spines['right'].set_visible(False)
    axes[i].spines['left'].set_visible(False)
    axes[i].set_yticks([])

    if i < n_panels - 1:
        axes[i].spines['bottom'].set_visible(False)
        # Hide ticks with tick_params rather than set_xticks([]): panels
        # created with sharex=True share one tick locator, so clearing the
        # ticks on an upper panel would also clear them on the bottom one.
        axes[i].tick_params(axis='x', which='both', bottom=False, labelbottom=False)

axes[-1].set_xlabel('Wage (thousand yuan/month)', fontsize=12)
plt.suptitle('Regional Wage Distribution (Ridgeline Style)', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

Section Summary

Distribution Comparison Method Selection

| Purpose | Recommended Chart | Advantages |
| --- | --- | --- |
| Compare shapes | Overlapping density plots | Intuitively display distribution shapes |
| Compare quantiles | ECDF | Precisely compare any quantile |
| Identify outliers | Grouped box plots | Five-number summary + outliers |
| Compare details | Grouped violin plots | Density + box plot |
| Multiple groups vertically | Ridgeline plot | Space-saving, easy comparison |

Next Section Preview

In the next section, we will learn how to create publication-quality figures meeting academic standards.

Master professional chart creation!

Released under the MIT License. Content © Author.