Skip to content

Stata Data File Reading and Writing

Seamless Integration Between Python and Stata


Why Do We Need to Read and Write Stata Files?

  • Current situation: Many social science datasets are stored in .dta format
  • Need: Use Stata data in Python
  • Advantage: Keep Stata metadata: value labels load as pandas categoricals, and variable labels can be read from the file and written back explicitly

Reading and Writing Stata Files with Pandas

Install Dependencies

bash
pip install pandas
# Pandas has built-in support for .dta files

Reading Stata Files

python
# Load a Stata file and inspect its names, variable labels and value labels.
import pandas as pd

# Basic reading
df = pd.read_stata('survey_data.dta')
print(df.head())

# View variable names (the DataFrame columns)
print(df.columns)

# View variable labels: they are not stored on the DataFrame, so read them
# from the file through a StataReader
with pd.read_stata('survey_data.dta', iterator=True) as reader:
    print(reader.variable_labels())

# Preserve value labels (e.g., 1='Male', 2='Female') as pandas categoricals
# (convert_categoricals=True is the default; spelled out here for clarity)
df = pd.read_stata('survey_data.dta', convert_categoricals=True)

Writing Stata Files

python
# Build a small DataFrame and write it out as a Stata .dta file.
import pandas as pd

df = pd.DataFrame({
    'respondent_id': [1, 2, 3],
    'age': [25, 30, 35],
    'income': [50000, 75000, 85000],
    'gender': ['Male', 'Female', 'Male']
})

# Save as Stata 13 format
df.to_stata('output.dta', write_index=False, version=117)

# dta format versions accepted by to_stata (oldest Stata that can open them):
# 114 = Stata 10 and later (pandas default)
# 117 = Stata 13 and later
# 118 = Stata 14 and later (adds Unicode support)
# 119 = Stata 15 and later (only needed for more than 32,767 variables)

Handling Variable Labels and Value Labels

Preserve Metadata When Reading

python
import pandas as pd

# Load the file, turning value labels into categoricals and keeping the
# stored data types
read_options = {
    'convert_categoricals': True,  # Preserve value labels
    'preserve_dtypes': True,       # Preserve data types
}
df = pd.read_stata('survey.dta', **read_options)

# Show the category labels when the column came back as a categorical
gender = df['gender']
if isinstance(gender.dtype, pd.CategoricalDtype):
    print(gender.cat.categories)

Add Labels When Writing

python
import pandas as pd

# Categorical columns are written to the .dta file with value labels
gender = pd.Categorical(['Male', 'Female', 'Male'])
education = pd.Categorical(['High School', 'Bachelor', 'Master'])

df = pd.DataFrame({'id': [1, 2, 3], 'gender': gender, 'education': education})

# One descriptive label per column
variable_labels = dict(
    id='Respondent ID',
    gender='Gender',
    education='Education Level',
)

df.to_stata('output.dta', write_index=False, variable_labels=variable_labels)

Practical Cases

Case 1: Stata to Python Data Flow

python
import pandas as pd
import numpy as np

# 1. Load the raw Stata file
df = pd.read_stata('raw_survey.dta')
print(f"Original data: {len(df)} rows")

# 2. Data cleaning (Python): ages 18-100 inclusive, strictly positive income
valid_age = df['age'].between(18, 100)
positive_income = df['income'] > 0
df_clean = df.loc[valid_age & positive_income].copy()

# 3. Derived variables
df_clean['log_income'] = np.log(df_clean['income'])
df_clean['age_squared'] = df_clean['age'].pow(2)

# 4. Write the cleaned data back out for Stata
df_clean.to_stata('clean_survey.dta', write_index=False)
print(f"After cleaning: {len(df_clean)} rows")

Case 2: Batch Process Multiple Stata Files

python
import pandas as pd
from pathlib import Path

# Read data from multiple years, skipping any year whose file is absent
years = [2020, 2021, 2022, 2023]
all_data = []

for year in years:
    file_path = Path(f'survey_{year}.dta')
    if not file_path.exists():
        # Report the gap instead of skipping it silently
        print(f"{year}: {file_path} not found, skipping")
        continue
    df = pd.read_stata(file_path)
    df['year'] = year  # Add year identifier
    all_data.append(df)
    print(f"{year}: {len(df)} rows")

# Merge (pd.concat raises ValueError on an empty list, so only merge when
# at least one file was actually read)
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df.to_stata('panel_data.dta', write_index=False)
    print(f"Total: {len(combined_df)} rows")
else:
    print("No survey files found; nothing to merge")

Case 3: Stata and Python Round-trip

python
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read from Stata, also capturing the variable labels stored in the file.
# A plain pd.read_stata call returns only the data, so without this step the
# original labels would be lost when the file is written back.
with pd.read_stata('original.dta', iterator=True) as reader:
    df = reader.read()
    original_labels = reader.variable_labels()

# Python data processing
# fit_transform returns an (n, 1) array; flatten it before storing as a column
scaler = StandardScaler()
df['income_std'] = scaler.fit_transform(df[['income']]).ravel()
df['age_std'] = scaler.fit_transform(df[['age']]).ravel()

# Save back to Stata (with labels): keep the original labels and add labels
# for the two new columns
variable_labels = {
    **original_labels,
    'income_std': 'Standardized Income',
    'age_std': 'Standardized Age'
}

df.to_stata(
    'processed.dta',
    write_index=False,
    variable_labels=variable_labels,
    version=117
)

Python vs Stata Data Operations Comparison

Reading Data

stata
* Stata
use "survey_data.dta", clear
python
# Python
# pd.read_stata loads the .dta file into a DataFrame
import pandas as pd
df = pd.read_stata('survey_data.dta')

Filtering Data

stata
* Stata
keep if age >= 18 & age <= 65
keep if income > 0
python
# Python
# Keep ages 18-65 (inclusive) with strictly positive income
working_age = df['age'].between(18, 65)
df = df[working_age & (df['income'] > 0)]

Generating New Variables

stata
* Stata
gen log_income = log(income)
gen age_squared = age^2
python
# Python
import numpy as np
# Natural log of income, then the square of age
income = df['income']
df['log_income'] = np.log(income)
df['age_squared'] = df['age'].pow(2)

Saving Data

stata
* Stata
save "output.dta", replace
python
# Python
# write_index=False keeps the DataFrame index out of the saved file
df.to_stata('output.dta', write_index=False)

Best Practices

1. Choose the Appropriate Stata Version

python
# Pick the dta format by the oldest Stata release that must open the file.

# Stata 13 and later: dta format 117
df.to_stata('output.dta', write_index=False, version=117)

# Stata 14 and later: dta format 118 adds Unicode support
df.to_stata('output.dta', write_index=False, version=118)

# Format 119 (Stata 15 and later) is only needed for data with more than
# 32,767 variables; version=None lets pandas choose between 118 and 119
# from the number of columns.

2. Handle Large Files

python
# Read large Stata file in chunks
import pandas as pd

results = []

# With chunksize, read_stata returns a StataReader that holds the file open;
# the with-block closes it even if processing a chunk fails.
with pd.read_stata('large_file.dta', chunksize=10000) as chunks:
    for chunk in chunks:
        # Process each chunk: keep only rows with age above 18
        processed = chunk[chunk['age'] > 18]
        results.append(processed)

df = pd.concat(results, ignore_index=True)

3. Preserve Data Types

python
# Request date and category conversion explicitly when loading
read_options = {
    'convert_dates': True,         # Convert dates
    'convert_categoricals': True,  # Preserve categories
}
df = pd.read_stata('data.dta', **read_options)

Python-Stata Workflows

Workflow 1: Stata Preprocessing → Python Analysis

python
# Step 1 happens in Stata: a .do file produces the cleaned data
# Step 2: load that file and fit the model in Python
import pandas as pd
from sklearn.linear_model import LinearRegression

df = pd.read_stata('clean_data.dta')

# Machine learning (Stata's weakness)
feature_columns = ['age', 'education_years']
X = df[feature_columns]
y = df['income']

# fit() returns the fitted estimator, so construction and fitting can chain
model = LinearRegression().fit(X, y)

Workflow 2: Python Processing → Stata Regression

python
# 1. Feature engineering in Python
import pandas as pd
import numpy as np

df = pd.read_stata('raw.dta')

# Take the log of positive incomes only. np.log gives -inf for 0 and NaN for
# negative values, and to_stata refuses to write infinite values; masking
# first leaves those rows missing, which is what Stata's own log() produces.
positive_income = df['income'].where(df['income'] > 0)
df['log_income'] = np.log(positive_income)
df['age_squared'] = df['age'] ** 2

# 2. Save for Stata
df.to_stata('for_regression.dta', write_index=False)

# 3. Run regression in Stata
# regress log_income age age_squared education

Practice Exercises

python
# Exercise 1: Format Conversion
# Read survey.dta
# Add new column 'income_category' (low/medium/high)
# Save as new .dta file, preserve variable labels

# Exercise 2: Batch Processing
# Read all .dta files in folder
# Merge into one large dataset
# Add source filename as new column
# Save as combined.dta

Next Steps

Next section: JSON Data Processing.

Keep going!

Released under the MIT License. Content © Author.