|
import pandas as pd
|
|
import os
|
|
import random
|
|
from faker import Faker
|
|
|
|
def generate_advanced_data(n=1000, seed=None):
    """
    Generate synthetic patient data for hospital readmission prediction.

    Includes clinical and social features commonly used in care management models.

    Every patient gets a hidden risk score: chronic conditions (weighted
    1.2) + ER visits + previous readmissions + one point when medication
    adherence is not "High" + one point of coin-flip noise.
    ``Readmitted_30_Days`` is 1 when that score exceeds 5, otherwise 0.

    Args:
        n (int): Number of patient records to generate. ``n <= 0`` yields
            an empty frame that still carries every column.
        seed (int | None): Optional seed for reproducible output. When
            ``None`` (the default) values are drawn from the module-level
            ``random`` generator, so a caller's ``random.seed(...)`` is
            still honoured.

    Returns:
        pandas.DataFrame: One row per patient, with ``Patient_ID`` running
        from ``P0001`` upwards.
    """
    # A private generator keeps seeded runs from disturbing global state;
    # the ``random`` module itself exposes the same randint/choice/random API.
    rng = random if seed is None else random.Random(seed)

    diagnoses = ['Diabetes', 'CHF', 'COPD', 'CKD', 'Depression']
    yes_no = ["Yes", "No"]
    # Explicit schema so that an empty result still has a header.
    columns = [
        "Patient_ID",
        "Age",
        "Gender",
        "Chronic_Conditions",
        "Primary_Diagnosis",
        "Num_ER_Visits",
        "Last_Discharge_Days_Ago",
        "Previous_Readmissions",
        "FollowUp_Scheduled",
        "Medication_Adherence",
        "Language_Barrier",
        "Housing_Instability",
        "Caregiver_Support",
        "Readmitted_30_Days",
    ]

    data = []
    for i in range(n):
        # NOTE: the order of the random draws below is deliberate -- it
        # keeps the output identical for callers that seed ``random``
        # globally before calling this function.
        chronic = rng.randint(0, 5)
        er_visits = rng.randint(0, 4)
        adherence = rng.choice(['Low', 'Medium', 'High'])
        prev_readmits = rng.randint(0, 3)
        age = rng.randint(45, 90)

        risk_score = (
            chronic * 1.2 +
            er_visits +
            (0 if adherence == 'High' else 1) +
            prev_readmits +
            (1 if rng.random() > 0.5 else 0)  # unexplained variation
        )
        readmitted = 1 if risk_score > 5 else 0

        data.append({
            "Patient_ID": f"P{i+1:04}",
            "Age": age,
            "Gender": rng.choice(["M", "F"]),
            "Chronic_Conditions": chronic,
            "Primary_Diagnosis": rng.choice(diagnoses),
            "Num_ER_Visits": er_visits,
            "Last_Discharge_Days_Ago": rng.randint(1, 60),
            "Previous_Readmissions": prev_readmits,
            "FollowUp_Scheduled": rng.choice(yes_no),
            "Medication_Adherence": adherence,
            "Language_Barrier": rng.choice(yes_no),
            "Housing_Instability": rng.choice(yes_no),
            "Caregiver_Support": rng.choice(yes_no),
            "Readmitted_30_Days": readmitted,
        })

    return pd.DataFrame(data, columns=columns)
|
|
|
|
def load_or_generate_data(path="data/patients.csv", n=1000):
    """
    Load existing patient data if available, otherwise generate synthetic data.

    When ``path`` does not exist, ``n`` synthetic records are generated,
    written to ``path`` as CSV (without the index column) and returned.

    Args:
        path (str): Location of the CSV file to read, or to create.
        n (int): Number of records to generate when the file is missing;
            ignored when the file already exists.

    Returns:
        pandas.DataFrame: The loaded or freshly generated patient data.
    """
    if os.path.exists(path):
        return pd.read_csv(path)

    df = generate_advanced_data(n=n)

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError -- only create the parent when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(path, index=False)
    return df
|
|
|