CPS-Test-Mobile

Paused

App Files Files Community

Ali2206 committed 26 days ago

Commit

a8c9181

verified ·

1 Parent(s): a135a34

Update app.py

Browse files

Files changed (1) hide show

app.py +401 -28

app.py CHANGED Viewed

@@ -6,16 +6,20 @@ import re
 import gc
 import time
 from datetime import datetime
-from typing import List, Tuple, Dict, Union
 import pandas as pd
 import pdfplumber
-import gradio as gr
 import torch
 import matplotlib.pyplot as plt
 from fpdf import FPDF
 import unicodedata
 # === Configuration ===
 persistent_dir = "/data/hf_cache"
 model_cache_dir = os.path.join(persistent_dir, "txagent_models")
 tool_cache_dir = os.path.join(persistent_dir, "tool_cache")
@@ -41,11 +45,45 @@ BATCH_SIZE = 1
 PROMPT_OVERHEAD = 300
 SAFE_SLEEP = 0.5
 def estimate_tokens(text: str) -> int:
     return len(text) // 4 + 1
 def clean_response(text: str) -> str:
-    text = re.sub(r"\[.*?\]|\bNone\b", "", text, flags=re.DOTALL)
     text = re.sub(r"\n{3,}", "\n\n", text)
     return text.strip()
@@ -60,29 +98,364 @@ def remove_duplicate_paragraphs(text: str) -> str:
             seen.add(clean_p)
     return "\n\n".join(unique_paragraphs)
-# === FastAPI for mobile API endpoint ===
-from fastapi import FastAPI, UploadFile, File
-from fastapi.responses import JSONResponse
-import uvicorn
-app = FastAPI()
-@app.post("/analyze")
-async def analyze_file_api(file: UploadFile = File(...)):
-    agent = init_agent()
-    temp_file_path = os.path.join(file_cache_dir, file.filename)
-    with open(temp_file_path, "wb") as f:
-        f.write(await file.read())
-    messages = []
-    messages, pdf_path = process_report(agent, open(temp_file_path, "rb"), messages)
-    if pdf_path:
-        return JSONResponse(content={"summary": messages[-2]['content'], "pdf": pdf_path})
-    return JSONResponse(content={"error": "Processing failed."}, status_code=400)
-# === Original Gradio UI launch preserved ===
 if __name__ == "__main__":
-    agent = init_agent()
-    ui = create_ui(agent)
-    import threading
-    threading.Thread(target=lambda: ui.launch(server_name="0.0.0.0", server_port=7860, allowed_paths=["/data/hf_cache/reports"], share=False)).start()
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 import gc
 import time
 from datetime import datetime
+from typing import List, Tuple, Dict, Union, Optional
 import pandas as pd
 import pdfplumber
 import torch
 import matplotlib.pyplot as plt
 from fpdf import FPDF
 import unicodedata
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
 # === Configuration ===
 persistent_dir = "/data/hf_cache"
 model_cache_dir = os.path.join(persistent_dir, "txagent_models")
 tool_cache_dir = os.path.join(persistent_dir, "tool_cache")
 PROMPT_OVERHEAD = 300
 SAFE_SLEEP = 0.5
+# === FastAPI App Setup ===
app = FastAPI(
    title="Clinical Patient Support System API",
    description="API for analyzing and summarizing unstructured medical files",
)

# CORS configuration for mobile app access.
# Every origin, method and header is accepted, so credentialed requests are
# switched off: a wildcard origin combined with allow_credentials=True would
# let any website issue cookie-bearing requests against this API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
+# === Data Models ===
class AnalysisRequest(BaseModel):
    """Request body describing a file submitted for analysis."""

    # Name of the file being submitted.
    filename: str
    # Base64 encoded file content (mobile apps can send this).
    file_content: str
class AnalysisResponse(BaseModel):
    """Response body carrying the outcome of an analysis run."""

    # Always present.
    status: str
    message: str
    # Optional details; each one defaults to None when it is not supplied.
    report_id: Optional[str] = None
    summary: Optional[str] = None
    error: Optional[str] = None
class ReportResponse(BaseModel):
    """Response body that tells a client where a generated report can be fetched."""

    status: str
    report_id: str
    # URL from which the report can be downloaded.
    download_url: str
+# === Helper Functions (same as original) ===
 def estimate_tokens(text: str) -> int:
     return len(text) // 4 + 1
 def clean_response(text: str) -> str:
+    text = re.sub(r"$.*?$|\bNone\b", "", text, flags=re.DOTALL)
     text = re.sub(r"\n{3,}", "\n\n", text)
     return text.strip()
             seen.add(clean_p)
     return "\n\n".join(unique_paragraphs)
def extract_text_from_excel(path: str) -> str:
    """Flatten every sheet of an Excel workbook into pipe-separated text lines.

    Args:
        path: Location of the workbook on disk.

    Returns:
        One line per retained row, formatted ``"[sheet] a | b | c"`` and joined
        with newlines. A row is retained only when it has at least two
        non-blank cells and its joined text is longer than 15 characters.
        Sheets that fail to parse are skipped.
    """
    all_text = []
    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(path) as xls:
        for sheet_name in xls.sheet_names:
            try:
                # Blank out missing cells BEFORE stringifying; in the opposite
                # order NaN becomes the literal text "nan" and fillna is a no-op.
                df = xls.parse(sheet_name).fillna("").astype(str)
            except Exception:
                # Best effort: an unreadable sheet must not abort the others.
                continue
            for _, row in df.iterrows():
                non_empty = [cell.strip() for cell in row if cell.strip()]
                if len(non_empty) >= 2:
                    text_line = " | ".join(non_empty)
                    if len(text_line) > 15:
                        all_text.append(f"[{sheet_name}] {text_line}")
    return "\n".join(all_text)
def extract_text_from_csv(path: str) -> str:
    """Flatten a CSV file into pipe-separated text lines.

    Args:
        path: Location of the CSV file on disk.

    Returns:
        One line per retained row, formatted ``"a | b | c"`` and joined with
        newlines. A row is retained only when it has at least two non-blank
        cells and its joined text is longer than 15 characters. An empty
        string is returned when the file cannot be read or parsed.
    """
    all_text = []
    try:
        # Blank out missing cells BEFORE stringifying; in the opposite order
        # NaN becomes the literal text "nan" and fillna has nothing to fill.
        df = pd.read_csv(path).fillna("").astype(str)
    except Exception:
        # Best effort: an unreadable file is reported as "no text".
        return ""
    for _, row in df.iterrows():
        non_empty = [cell.strip() for cell in row if cell.strip()]
        if len(non_empty) >= 2:
            text_line = " | ".join(non_empty)
            if len(text_line) > 15:
                all_text.append(text_line)
    return "\n".join(all_text)
def extract_text_from_pdf(path: str) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Pages for which no text is extracted are left out. Any failure while
    opening or reading the document yields an empty string.
    """
    import logging

    # Raise the "pdfminer" logger threshold so only errors are emitted.
    logging.getLogger("pdfminer").setLevel(logging.ERROR)

    page_texts = []
    try:
        with pdfplumber.open(path) as document:
            for page in document.pages:
                content = page.extract_text()
                if not content:
                    continue
                page_texts.append(content.strip())
    except Exception:
        return ""
    return "\n".join(page_texts)
def extract_text(file_path: str) -> str:
    """Dispatch to the extractor that matches the file's extension.

    Args:
        file_path: Path of the file to read; only its extension is inspected
            here.

    Returns:
        The extracted text for ``.xlsx``, ``.csv`` and ``.pdf`` files, or an
        empty string for any other extension.
    """
    # Compare case-insensitively: names such as "SCAN.PDF" would otherwise
    # fall through to the unsupported branch.
    lowered = file_path.lower()
    if lowered.endswith(".xlsx"):
        return extract_text_from_excel(file_path)
    if lowered.endswith(".csv"):
        return extract_text_from_csv(file_path)
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    return ""
def split_text(text: str, max_tokens=MAX_CHUNK_TOKENS) -> List[str]:
    """Group the lines of *text* into chunks that fit a token budget.

    The budget per chunk is ``max_tokens - PROMPT_OVERHEAD``, measured with
    ``estimate_tokens``. Lines are never split: a line that would push the
    current chunk over budget starts a new chunk instead.
    """
    budget = max_tokens - PROMPT_OVERHEAD
    pieces = []
    pending = []
    pending_cost = 0
    for row in text.split("\n"):
        cost = estimate_tokens(row)
        if pending_cost + cost <= budget:
            pending.append(row)
            pending_cost += cost
            continue
        # Over budget: flush what has been gathered and restart with this line.
        if pending:
            pieces.append("\n".join(pending))
        pending, pending_cost = [row], cost
    if pending:
        pieces.append("\n".join(pending))
    return pieces
def batch_chunks(chunks: List[str], batch_size: int = BATCH_SIZE) -> List[List[str]]:
    """Slice *chunks* into consecutive groups of at most *batch_size* items."""
    grouped = []
    for start in range(0, len(chunks), batch_size):
        grouped.append(chunks[start:start + batch_size])
    return grouped
def build_prompt(chunk: str) -> str:
    """Wrap one chunk of clinical notes in the per-chunk analysis prompt."""
    header = (
        "### Unstructured Clinical Records\n\n"
        "Analyze the clinical notes below and summarize with:\n"
        "- Diagnostic Patterns\n"
        "- Medication Issues\n"
        "- Missed Opportunities\n"
        "- Inconsistencies\n"
        "- Follow-up Recommendations\n\n"
        "---\n\n"
    )
    footer = "\n\n---\nRespond concisely in bullet points with clinical reasoning."
    return f"{header}{chunk}{footer}"
def init_agent() -> TxAgent:
    """Create a TxAgent, load its model and return it.

    The tool definition ``new_tool.json`` is copied from ``data/`` into the
    tool cache directory first when it is not already there.
    """
    tool_path = os.path.join(tool_cache_dir, "new_tool.json")
    already_cached = os.path.exists(tool_path)
    if not already_cached:
        bundled_copy = os.path.abspath("data/new_tool.json")
        shutil.copy(bundled_copy, tool_path)

    settings = dict(
        model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
        rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
        tool_files_dict={"new_tool": tool_path},
        force_finish=True,
        enable_checker=True,
        step_rag_num=4,
        seed=100,
    )
    agent = TxAgent(**settings)
    agent.init_model()
    return agent
def analyze_batches(agent, batches: List[List[str]]) -> List[str]:
    """Run the agent over each batch of chunks and collect one result per batch.

    Each batch's chunks are turned into prompts and joined into one message.
    Whatever the agent yields (plain strings, objects with a ``content``
    attribute, or lists of such objects) is concatenated and cleaned. A batch
    that raises is recorded as a ``"❌ Batch failed: ..."`` entry instead of
    aborting the run.
    """
    outputs = []
    for group in batches:
        prompt = "\n\n".join(map(build_prompt, group))
        try:
            collected = ""
            stream = agent.run_gradio_chat(
                message=prompt,
                history=[],
                temperature=0.0,
                max_new_tokens=MAX_NEW_TOKENS,
                max_token=MAX_MODEL_TOKENS,
                call_agent=False,
                conversation=[],
            )
            for item in stream:
                if isinstance(item, str):
                    collected += item
                elif isinstance(item, list):
                    for entry in item:
                        if hasattr(entry, "content"):
                            collected += entry.content
                elif hasattr(item, "content"):
                    collected += item.content
            outputs.append(clean_response(collected))
            time.sleep(SAFE_SLEEP)
        except Exception as exc:
            outputs.append(f"❌ Batch failed: {str(exc)}")
            # Wait twice as long after a failure before the next batch.
            time.sleep(SAFE_SLEEP * 2)
        # Release cached GPU memory and run the garbage collector per batch.
        torch.cuda.empty_cache()
        gc.collect()
    return outputs
def generate_final_summary(agent, combined: str) -> str:
    """Ask the agent to merge the per-batch summaries into one final report.

    Duplicate paragraphs are removed from the input before prompting and from
    the cleaned model output before it is returned.
    """
    combined = remove_duplicate_paragraphs(combined)
    final_prompt = f"""
    You are an expert clinical summarizer. Analyze the following summaries carefully and generate a **single final concise structured medical report**, avoiding any repetition or redundancy.
    Summaries:
    {combined}
    Respond with:
    * Diagnostic Patterns
    * Medication Issues
    * Missed Opportunities
    * Inconsistencies
    * Follow-up Recommendations
    Avoid repeating the same points multiple times.
    """.strip()

    reply = ""
    stream = agent.run_gradio_chat(
        message=final_prompt,
        history=[],
        temperature=0.0,
        max_new_tokens=MAX_NEW_TOKENS,
        max_token=MAX_MODEL_TOKENS,
        call_agent=False,
        conversation=[],
    )
    # The agent may yield strings, objects with ``content``, or lists of them.
    for item in stream:
        if isinstance(item, str):
            reply += item
        elif isinstance(item, list):
            for entry in item:
                if hasattr(entry, "content"):
                    reply += entry.content
        elif hasattr(item, "content"):
            reply += item.content

    return remove_duplicate_paragraphs(clean_response(reply))
def remove_non_ascii(text):
    """Drop every character whose code point is 256 or higher.

    Despite the name, characters with code points 128-255 are kept; only
    characters beyond that range are removed.
    """
    kept = [ch for ch in text if ord(ch) <= 255]
    return "".join(kept)
def _save_chart(path, figsize, title, draw):
    """Render one matplotlib figure via *draw*, title it and save it to *path*."""
    plt.figure(figsize=figsize)
    draw()
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def generate_pdf_report_with_charts(summary: str, report_path: str, detailed_batches: Optional[List[str]] = None):
    """Build the PDF version of a report: title page, summary, charts, details.

    Args:
        summary: Final summary text; each non-blank line becomes a paragraph.
        report_path: Path of the markdown report. The PDF is written next to
            it with the trailing ``.md`` replaced by ``.pdf``; chart images go
            into a ``charts`` sub-directory of the same folder.
        detailed_batches: Optional per-batch outputs appended as a final
            "Detailed Tool Insights" section.

    Returns:
        The path of the PDF file that was written.
    """
    chart_dir = os.path.join(os.path.dirname(report_path), "charts")
    os.makedirs(chart_dir, exist_ok=True)

    # NOTE(review): the chart values are hard-coded constants; they are not
    # derived from `summary` or `detailed_batches`.
    categories = ['Diagnostics', 'Medications', 'Missed', 'Inconsistencies', 'Follow-up']
    values = [4, 2, 3, 1, 5]

    bar_chart_path = os.path.join(chart_dir, "bar_chart.png")
    pie_chart_path = os.path.join(chart_dir, "pie_chart.png")
    trend_chart_path = os.path.join(chart_dir, "trend_chart.png")
    _save_chart(bar_chart_path, (6, 4), 'Clinical Issues Overview',
                lambda: plt.bar(categories, values))
    _save_chart(pie_chart_path, (6, 6), 'Issue Distribution',
                lambda: plt.pie(values, labels=categories, autopct='%1.1f%%'))
    _save_chart(trend_chart_path, (6, 4), 'Trend Analysis',
                lambda: plt.plot(categories, values, marker='o'))

    # Swap only the trailing extension: str.replace would also rewrite a
    # ".md" that happens to occur inside a directory name.
    if report_path.endswith('.md'):
        pdf_path = report_path[:-len('.md')] + '.pdf'
    else:
        pdf_path = report_path.replace('.md', '.pdf')

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)

    # === Title Page ===
    pdf.add_page()
    pdf.set_font("Arial", 'B', 24)
    pdf.cell(0, 20, remove_non_ascii("Final Medical Report"), ln=True, align='C')
    pdf.set_font("Arial", '', 14)
    pdf.cell(0, 10, datetime.now().strftime("Generated on %B %d, %Y at %H:%M"), ln=True, align='C')
    pdf.ln(20)
    pdf.set_font("Arial", 'I', 12)
    pdf.multi_cell(0, 10, remove_non_ascii(
        "This report contains a professional summary of clinical observations, potential inconsistencies, and follow-up recommendations based on the uploaded medical document."
    ), align="C")

    # === Summary Section ===
    pdf.add_page()
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(0, 10, remove_non_ascii("Final Summary"), ln=True)
    pdf.set_draw_color(200, 200, 200)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.ln(5)
    pdf.set_font("Arial", '', 12)
    for line in summary.split("\n"):
        # Characters with code points >= 256 are dropped before rendering.
        clean_line = remove_non_ascii(line.strip())
        if clean_line:
            pdf.multi_cell(0, 8, txt=clean_line)

    # === Charts Section ===
    pdf.add_page()
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(0, 10, remove_non_ascii("Statistical Overview"), ln=True)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.ln(5)
    pdf.set_font("Arial", 'B', 12)
    pdf.cell(0, 10, remove_non_ascii("1. Clinical Issues Overview"), ln=True)
    pdf.image(bar_chart_path, w=180)
    pdf.ln(5)
    pdf.cell(0, 10, remove_non_ascii("2. Issue Distribution"), ln=True)
    pdf.image(pie_chart_path, w=150)
    pdf.ln(5)
    pdf.cell(0, 10, remove_non_ascii("3. Trend Analysis"), ln=True)
    pdf.image(trend_chart_path, w=180)

    # === Detailed Tool Outputs ===
    if detailed_batches:
        pdf.add_page()
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(0, 10, remove_non_ascii("Detailed Tool Insights"), ln=True)
        pdf.line(10, pdf.get_y(), 200, pdf.get_y())
        pdf.ln(5)
        for idx, detail in enumerate(detailed_batches):
            pdf.set_font("Arial", 'B', 13)
            pdf.cell(0, 10, remove_non_ascii(f"Tool Output #{idx + 1}"), ln=True)
            pdf.set_font("Arial", '', 11)
            for line in remove_non_ascii(detail).split("\n"):
                pdf.multi_cell(0, 8, txt=line.strip())
            pdf.ln(3)

    pdf.output(pdf_path)
    return pdf_path
+# === API Endpoints ===
@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_file(file: UploadFile = File(...)):
    """Analyze an uploaded medical file and produce markdown and PDF reports.

    The upload is stored temporarily, its text is extracted, split into
    chunks, analyzed batch by batch, and condensed into a final summary that
    is written to ``<report_id>.md`` and rendered to PDF.

    Raises:
        HTTPException: 400 when no text can be extracted or no batch produced
            a valid output; 500 for any other failure.
    """
    # The filename is client-controlled: keep only its last path component so
    # the temporary file cannot be written outside file_cache_dir.
    safe_name = os.path.basename(file.filename or "") or "upload"
    temp_path = os.path.join(file_cache_dir, safe_name)
    try:
        start_time = time.time()
        # Save the uploaded file temporarily
        with open(temp_path, "wb") as f:
            f.write(await file.read())
        # NOTE(review): the ID has one-second resolution, so two uploads
        # handled within the same second share a report ID.
        report_id = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        # Initialize agent (could be done once at startup)
        agent = init_agent()
        # Process the file
        extracted = extract_text(temp_path)
        if not extracted:
            raise HTTPException(status_code=400, detail="Could not extract text from file")
        chunks = split_text(extracted)
        batches = batch_chunks(chunks, batch_size=BATCH_SIZE)
        batch_results = analyze_batches(agent, batches)
        all_tool_outputs = batch_results.copy()
        # Failed batches are marked with a leading "❌" by analyze_batches.
        valid = [res for res in batch_results if not res.startswith("❌")]
        if not valid:
            raise HTTPException(status_code=400, detail="No valid batch outputs generated")
        summary = generate_final_summary(agent, "\n\n".join(valid))
        # Save report files
        report_path = os.path.join(report_dir, f"{report_id}.md")
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(f"# Final Medical Report\n\n{summary}")
        generate_pdf_report_with_charts(summary, report_path, detailed_batches=all_tool_outputs)
        elapsed_time = time.time() - start_time
        return {
            "status": "success",
            "message": f"Report generated in {elapsed_time:.2f} seconds",
            "report_id": report_id,
            "summary": summary
        }
    except HTTPException:
        # Let the deliberate 4xx responses through; the generic handler below
        # would otherwise turn them into 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Remove the temporary upload on success and on failure alike.
        if os.path.exists(temp_path):
            os.remove(temp_path)
@app.get("/report/{report_id}", response_model=ReportResponse)
async def get_report(report_id: str):
    """Return the download URL of a report, or 404 when its PDF does not exist."""
    expected_pdf = os.path.join(report_dir, report_id + ".pdf")
    if os.path.exists(expected_pdf):
        return {
            "status": "success",
            "report_id": report_id,
            "download_url": "/download/" + report_id,
        }
    raise HTTPException(status_code=404, detail="Report not found")
@app.get("/download/{report_id}")
async def download_report(report_id: str):
    """Send the PDF of a report as a file download, or 404 when it is missing."""
    stored_pdf = os.path.join(report_dir, report_id + ".pdf")
    if os.path.exists(stored_pdf):
        download_name = "medical_report_" + report_id + ".pdf"
        return FileResponse(
            stored_pdf,
            media_type="application/pdf",
            filename=download_name,
        )
    raise HTTPException(status_code=404, detail="Report not found")
@app.get("/health")
async def health_check():
    """Report that the service is up."""
    payload = {"status": "healthy"}
    return payload
+# === Main Application ===
 if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)