Modules and Package Management
Organizing Code, Reusing Functionality — Building Maintainable Projects
What is a Module?
- Module = a .py file
- Package = a folder containing __init__.py (see the layout sketch below)
Analogy:
- Stata: .ado files
- R: packages (like dplyr, ggplot2)
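A minimal layout sketch (hypothetical file names) contrasting the two:
```python
# stats_helpers.py        <- a module: a single .py file
#
# survey_toolkit/         <- a package: a folder of modules
#     __init__.py
#     cleaning.py
#
# Both are imported the same way:
#   import stats_helpers
#   from survey_toolkit import cleaning
```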
Importing Modules
1. Import Entire Module
```python
import math
print(math.sqrt(16)) # 4.0
print(math.pi) # 3.141592653589793
print(math.log(100))  # 4.605...
```
2. Import with Alias
```python
import pandas as pd # Standard practice
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame({'x': [1, 2, 3]})
```
3. Import Specific Functions
```python
from math import sqrt, pi, log
print(sqrt(16)) # 4.0 (no need for math.)
print(pi)     # 3.141592653589793
```
4. Import All (Not Recommended)
```python
from math import * # ❌ Not recommended, may overwrite existing variables
# ✅ Recommended: Explicit imports
from math import sqrt, pi, log
```
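Why the wildcard is risky: a minimal sketch in which a variable named e (a hypothetical analysis value) is silently rebound by the import:
```python
e = 0.42             # your own variable, e.g. an elasticity estimate
from math import *   # rebinds e to Euler's number without warning
print(e)             # 2.718281828459045 -- the original value is gone
```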
Common Standard Libraries
1. math: Mathematical Operations
```python
import math
# Basic functions
math.sqrt(16) # Square root
math.log(100) # Natural logarithm
math.log10(100) # Base-10 logarithm
math.exp(2) # e^2
math.pow(2, 3) # 2^3
# Trigonometric functions
math.sin(math.pi / 2) # 1.0
math.cos(0) # 1.0
# Constants
math.pi # 3.14159...
math.e   # 2.71828...
```
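A common social-science use of math.log is transforming skewed income data before analysis; a minimal sketch with illustrative values:
```python
import math

incomes = [30000, 75000, 120000]
log_incomes = [math.log(x) for x in incomes]
print(log_incomes)  # [10.308..., 11.225..., 11.695...]
```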
2. statistics: Statistics
```python
import statistics as stats
data = [85, 92, 78, 90, 88]
stats.mean(data) # Mean
stats.median(data) # Median
stats.mode(data) # Mode
stats.stdev(data)     # Sample standard deviation
stats.variance(data)  # Sample variance
```
3. random: Random Numbers
```python
import random
# Random integer
random.randint(1, 10)   # Random integer from 1 to 10, inclusive
# Random floats
random.random()         # Float in [0.0, 1.0)
random.uniform(1, 10)   # Float between 1 and 10
# Random selection
majors = ['Economics', 'Sociology', 'Political Science']
random.choice(majors) # Randomly pick one
# Random sampling
random.sample(majors, 2) # Sample 2 without replacement
# Shuffle list
data = [1, 2, 3, 4, 5]
random.shuffle(data)  # Shuffle in place
```
Social Science Application: Random Assignment
```python
import random
participants = list(range(1, 101)) # 100 participants
random.shuffle(participants)
# Divide into treatment and control groups
treatment = participants[:50]
control = participants[50:]
print(f"Treatment group: {len(treatment)} people")
print(f"Control group: {len(control)} people")4. datetime: Date and Time
4. datetime: Date and Time
```python
from datetime import datetime, timedelta
# Current time
now = datetime.now()
print(now.strftime("%Y-%m-%d %H:%M:%S"))
# Date arithmetic
survey_start = datetime(2024, 1, 1)
survey_end = survey_start + timedelta(days=365)
duration = survey_end - survey_start
print(f"Survey period: {duration.days} days")
# Parse date
date_str = "2024-03-15"
date_obj = datetime.strptime(date_str, "%Y-%m-%d")
```
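A related task is computing a respondent's age from a birth date; a minimal sketch with a hypothetical date:
```python
from datetime import datetime

birth = datetime(1990, 6, 15)  # hypothetical respondent birth date
today = datetime.now()
# Subtract one if this year's birthday hasn't happened yet
age = today.year - birth.year - ((today.month, today.day) < (birth.month, birth.day))
print(f"Respondent age: {age}")
```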
5. json: JSON Data
```python
import json
# Dictionary to JSON
data = {
    'respondent_id': 1001,
    'age': 30,
    'income': 75000
}
json_str = json.dumps(data, indent=2)
print(json_str)
# JSON to dictionary
parsed = json.loads(json_str)
print(parsed['age'])  # 30
```
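dumps/loads work on strings; json.dump and json.load do the same for files. A minimal sketch with a hypothetical filename:
```python
import json

data = {'respondent_id': 1001, 'age': 30, 'income': 75000}

with open('responses.json', 'w') as f:
    json.dump(data, f, indent=2)   # write dictionary to file

with open('responses.json') as f:
    loaded = json.load(f)          # read it back
print(loaded['respondent_id'])     # 1001
```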
Creating Your Own Modules
Example: Create survey_utils.py
```python
# survey_utils.py
"""Survey data processing utility module"""
def calculate_response_rate(collected, target):
    """Calculate response rate"""
    return (collected / target) * 100

def validate_age(age):
    """Validate age"""
    return 0 < age < 120

def income_to_category(income):
    """Categorize income"""
    if income < 50000:
        return "Low income"
    elif income < 100000:
        return "Middle income"
    else:
        return "High income"

# Module-level constants
VALID_GENDERS = ["Male", "Female", "Other"]
MIN_AGE = 18
MAX_AGE = 100
```
Using Custom Module
```python
# main.py
import survey_utils
# Use functions
rate = survey_utils.calculate_response_rate(850, 1000)
print(f"Response rate: {rate:.1f}%")
is_valid = survey_utils.validate_age(25)
category = survey_utils.income_to_category(75000)
# Use constants
print(survey_utils.VALID_GENDERS)
```
Package Structure
my_project/
├── data_processing/
│ ├── __init__.py
│ ├── cleaning.py
│ └── validation.py
├── analysis/
│ ├── __init__.py
│ ├── descriptive.py
│ └── regression.py
└── main.py
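__init__.py can also re-export names so callers get shorter imports; a minimal sketch using the functions from this structure:
```python
# data_processing/__init__.py
from .cleaning import remove_outliers
from .validation import validate_age

# main.py can then write:
#   from data_processing import remove_outliers, validate_age
```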
Importing Modules from Packages
```python
# main.py
from data_processing import cleaning
from data_processing.validation import validate_age
from analysis.descriptive import calculate_mean
# Usage
cleaned_data = cleaning.remove_outliers(raw_data)
is_valid = validate_age(25)
mean = calculate_mean([1, 2, 3, 4, 5])
```
Third-Party Package Management
pip: Package Manager
```bash
# Install package
pip install pandas
pip install numpy matplotlib
# Install specific version
pip install pandas==1.5.3
# Batch install
pip install -r requirements.txt
# View installed packages
pip list
# Uninstall
pip uninstall pandas
```
requirements.txt
```
pandas==1.5.3
numpy==1.24.3
matplotlib==3.7.1
scikit-learn==1.2.2
statsmodels==0.14.0
```
Virtual Environments
```bash
# Create virtual environment
python -m venv myenv
# Activate (Mac/Linux)
source myenv/bin/activate
# Activate (Windows)
myenv\Scripts\activate
# Install packages (in virtual environment)
pip install pandas numpy
# Export dependencies
pip freeze > requirements.txt
# Deactivate
deactivate
```
Real-World: Project Structure
income_survey_project/
├── data/
│ ├── raw/
│ │ └── survey_responses.csv
│ └── processed/
│ └── clean_data.csv
├── src/
│ ├── __init__.py
│ ├── data_cleaning.py
│ ├── statistical_analysis.py
│ └── visualization.py
├── tests/
│ ├── test_cleaning.py
│ └── test_analysis.py
├── notebooks/
│ ├── exploratory_analysis.ipynb
│ └── final_report.ipynb
├── outputs/
│ ├── figures/
│ └── tables/
├── main.py
├── requirements.txt
└── README.md
src/data_cleaning.py
```python
"""Data cleaning module"""
import pandas as pd
def remove_outliers(df, column, lower=0.01, upper=0.99):
    """Remove outliers"""
    q_low = df[column].quantile(lower)
    q_high = df[column].quantile(upper)
    return df[(df[column] >= q_low) & (df[column] <= q_high)]

def validate_responses(df):
    """Validate response data"""
    valid_mask = (
        (df['age'] >= 18) &
        (df['age'] <= 100) &
        (df['income'] >= 0)
    )
    return df[valid_mask]
```
main.py
```python
"""Main program"""
import pandas as pd
from src.data_cleaning import remove_outliers, validate_responses
from src.statistical_analysis import calculate_descriptive_stats
from src.visualization import plot_income_distribution
# Read data
df = pd.read_csv('data/raw/survey_responses.csv')
# Clean
df = validate_responses(df)
df = remove_outliers(df, 'income')
# Analyze
stats = calculate_descriptive_stats(df)
print(stats)
# Visualize
plot_income_distribution(df, save_path='outputs/figures/income_dist.png')
```
Best Practices
1. Import Order
```python
# 1. Standard library
import os
import sys
from datetime import datetime
# 2. Third-party libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 3. Your own modules
from src.data_cleaning import remove_outliers
from src.analysis import calculate_mean
```
2. Avoid Circular Imports
```python
# ❌ Error: A imports B, B imports A
# module_a.py
from module_b import func_b
# module_b.py
from module_a import func_a # Circular import!
# ✅ Solution: extract shared dependencies into a third module, as sketched below
```
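A minimal sketch of that fix, with a hypothetical shared module common.py:
```python
# common.py -- shared helpers live here
def shared_helper():
    ...

# module_a.py
#   from common import shared_helper

# module_b.py
#   from common import shared_helper   # both depend on common; no cycle
```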
3. if __name__ == "__main__"
```python
# my_module.py
def calculate_mean(data):
    return sum(data) / len(data)

# Module-level test code
if __name__ == "__main__":
    # Only executed when run directly
    test_data = [1, 2, 3, 4, 5]
    print(calculate_mean(test_data))
```
Practice Exercises
Exercise 1: Create Utility Module
```python
# Create stats_utils.py module containing:
# 1. calculate_mean(data)
# 2. calculate_median(data)
# 3. calculate_variance(data)
# 4. CONFIDENCE_LEVEL = 0.95 (constant)
# Then import and use in another file
```
Exercise 2: Data Processing Pipeline
```python
# Create package structure:
# data_pipeline/
# __init__.py
# cleaning.py # Data cleaning functions
# transform.py # Data transformation functions
# export.py # Export functions
# Use complete pipeline in main.py
```
Summary
You have now mastered:
- ✅ Function definition and calling
- ✅ Parameter passing (positional, keyword, variable)
- ✅ Lambda functions
- ✅ Module and package management
Next Module: We'll learn about Object-Oriented Programming Basics.
Ready?