"""Synthetic patient data for hospital readmission prediction."""

import os
import random
from typing import Optional

import pandas as pd

# Nothing in this module calls Faker. The import is kept so the name stays
# available to anything that pulled it from here, but it is optional so the
# generator still works when the package is not installed.
try:
    from faker import Faker  # noqa: F401
except ImportError:
    Faker = None

_DIAGNOSES = ['Diabetes', 'CHF', 'COPD', 'CKD', 'Depression']
_ADHERENCE_LEVELS = ['Low', 'Medium', 'High']
_YES_NO = ["Yes", "No"]

# A record is labelled as readmitted when its risk score exceeds this value.
_RISK_THRESHOLD = 5

# Explicit column order so that an empty dataset still carries the schema.
_COLUMNS = [
    "Patient_ID",
    "Age",
    "Gender",
    "Chronic_Conditions",
    "Primary_Diagnosis",
    "Num_ER_Visits",
    "Last_Discharge_Days_Ago",
    "Previous_Readmissions",
    "FollowUp_Scheduled",
    "Medication_Adherence",
    "Language_Barrier",
    "Housing_Instability",
    "Caregiver_Support",
    "Readmitted_30_Days",
]


def generate_advanced_data(n: int = 1000, seed: Optional[int] = None) -> pd.DataFrame:
    """Generate synthetic patient records for hospital readmission prediction.

    Each record combines randomly drawn clinical and social features with a
    ``Readmitted_30_Days`` label. The label is 1 when a risk score built from
    chronic conditions, ER visits, medication adherence, previous
    readmissions and a random 0/1 noise term is greater than 5, else 0.

    Args:
        n: Number of records to generate. ``0`` yields an empty frame that
            still has all columns.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global ``random``
            state is left untouched. When ``None``, the global ``random``
            module is used.

    Returns:
        A DataFrame with one row per patient and the columns in ``_COLUMNS``.
    """
    # The ``random`` module exposes the same randint/choice/random API as a
    # ``random.Random`` instance, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    records = []
    for i in range(n):
        # The order of the draws below is kept stable on purpose: callers
        # that seed the global generator get the same sequence of values.
        chronic = rng.randint(0, 5)
        er_visits = rng.randint(0, 4)
        adherence = rng.choice(_ADHERENCE_LEVELS)
        prev_readmits = rng.randint(0, 3)
        age = rng.randint(45, 90)

        risk_score = (
            chronic * 1.2
            + er_visits
            + (0 if adherence == 'High' else 1)
            + prev_readmits
            + (1 if rng.random() > 0.5 else 0)  # coin-flip noise term
        )
        readmitted = 1 if risk_score > _RISK_THRESHOLD else 0

        records.append({
            "Patient_ID": f"P{i+1:04}",
            "Age": age,
            "Gender": rng.choice(["M", "F"]),
            "Chronic_Conditions": chronic,
            "Primary_Diagnosis": rng.choice(_DIAGNOSES),
            "Num_ER_Visits": er_visits,
            "Last_Discharge_Days_Ago": rng.randint(1, 60),
            "Previous_Readmissions": prev_readmits,
            "FollowUp_Scheduled": rng.choice(_YES_NO),
            "Medication_Adherence": adherence,
            "Language_Barrier": rng.choice(_YES_NO),
            "Housing_Instability": rng.choice(_YES_NO),
            "Caregiver_Support": rng.choice(_YES_NO),
            "Readmitted_30_Days": readmitted,
        })

    # Passing the columns explicitly keeps the schema even when n == 0, so a
    # CSV written from the result always has a header row.
    return pd.DataFrame(records, columns=_COLUMNS)


def load_or_generate_data(
    path: str = "data/patients.csv",
    n: int = 1000,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """Load patient data from ``path``, generating and saving it if missing.

    Args:
        path: CSV file to read, or to create when it does not exist.
        n: Number of records to generate when the file is missing.
        seed: Optional seed forwarded to ``generate_advanced_data``; ignored
            when the file already exists.

    Returns:
        The DataFrame read from ``path`` if it exists, otherwise the freshly
        generated DataFrame (which is also written to ``path``).
    """
    if os.path.exists(path):
        return pd.read_csv(path)

    df = generate_advanced_data(n=n, seed=seed)

    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    df.to_csv(path, index=False)
    return df