CPS-Test-Mobile

Paused

App Files Files Community

Ali2206 commited on Apr 19

Commit

ba63eca

verified ·

1 Parent(s): 65e7d58

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -450

app.py CHANGED Viewed

@@ -1,309 +1,33 @@
-import sys
-import os
-import pandas as pd
-import pdfplumber
-import json
-import gradio as gr
-from typing import List, Dict, Optional, Generator, Any
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import hashlib
-import shutil
-import re
-import psutil
-import subprocess
-import logging
-import torch
-import gc
-from diskcache import Cache
-import time
-from transformers import AutoTokenizer
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Persistent directory
-persistent_dir = "/data/hf_cache"
-os.makedirs(persistent_dir, exist_ok=True)
-model_cache_dir = os.path.join(persistent_dir, "txagent_models")
-tool_cache_dir = os.path.join(persistent_dir, "tool_cache")
-file_cache_dir = os.path.join(persistent_dir, "cache")
-report_dir = os.path.join(persistent_dir, "reports")
-vllm_cache_dir = os.path.join(persistent_dir, "vllm_cache")
-for directory in [model_cache_dir, tool_cache_dir, file_cache_dir, report_dir, vllm_cache_dir]:
-    os.makedirs(directory, exist_ok=True)
-os.environ["HF_HOME"] = model_cache_dir
-os.environ["TRANSFORMERS_CACHE"] = model_cache_dir
-os.environ["VLLM_CACHE_DIR"] = vllm_cache_dir
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-current_dir = os.path.dirname(os.path.abspath(__file__))
-src_path = os.path.abspath(os.path.join(current_dir, "src"))
-sys.path.insert(0, src_path)
-from txagent.txagent import TxAgent
-# Initialize cache with 10GB limit
-cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
-# Initialize tokenizer for precise chunking
-tokenizer = AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")
-def sanitize_utf8(text: str) -> str:
-    return text.encode("utf-8", "ignore").decode("utf-8")
-def file_hash(path: str) -> str:
-    with open(path, "rb") as f:
-        return hashlib.md5(f.read()).hexdigest()
-def extract_all_pages(file_path: str, progress_callback=None) -> str:
-    try:
-        with pdfplumber.open(file_path) as pdf:
-            total_pages = len(pdf.pages)
-            if total_pages == 0:
-                return ""
-        batch_size = 10
-        batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
-        text_chunks = [""] * total_pages
-        processed_pages = 0
-        def extract_batch(start: int, end: int) -> List[tuple]:
-            results = []
-            with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages[start:end]:
-                    page_num = start + pdf.pages.index(page)
-                    page_text = page.extract_text() or ""
-                    results.append((page_num, f"=== Page {page_num + 1} ===\n{page_text.strip()}"))
-            return results
-        with ThreadPoolExecutor(max_workers=6) as executor:
-            futures = [executor.submit(extract_batch, start, end) for start, end in batches]
-            for future in as_completed(futures):
-                for page_num, text in future.result():
-                    text_chunks[page_num] = text
-                processed_pages += batch_size
-                if progress_callback:
-                    progress_callback(min(processed_pages, total_pages), total_pages)
-        return "\n\n".join(filter(None, text_chunks))
-    except Exception as e:
-        logger.error("PDF processing error: %s", e)
-        return f"PDF processing error: {str(e)}"
-def excel_to_json(file_path: str) -> List[Dict]:
-    try:
-        try:
-            df = pd.read_excel(file_path, engine='openpyxl', header=None, dtype=str)
-        except Exception:
-            df = pd.read_excel(file_path, engine='xlrd', header=None, dtype=str)
-        content = df.where(pd.notnull(df), "").astype(str).values.tolist()
-        return [{
-            "filename": os.path.basename(file_path),
-            "rows": content,
-            "type": "excel"
-        }]
-    except Exception as e:
-        logger.error(f"Error processing Excel file: {e}")
-        return [{"error": f"Error processing Excel file: {str(e)}"}]
-def csv_to_json(file_path: str) -> List[Dict]:
-    try:
-        chunks = []
-        for chunk in pd.read_csv(
-            file_path,
-            header=None,
-            dtype=str,
-            encoding_errors='replace',
-            on_bad_lines='skip',
-            chunksize=10000
-        ):
-            chunks.append(chunk)
-        df = pd.concat(chunks) if chunks else pd.DataFrame()
-        content = df.where(pd.notnull(df), "").astype(str).values.tolist()
-        return [{
-            "filename": os.path.basename(file_path),
-            "rows": content,
-            "type": "csv"
-        }]
-    except Exception as e:
-        logger.error(f"Error processing CSV file: {e}")
-        return [{"error": f"Error processing CSV file: {str(e)}"}]
-def process_file(file_path: str, file_type: str) -> List[Dict]:
-    try:
-        if file_type == "pdf":
-            text = extract_all_pages(file_path)
-            return [{
-                "filename": os.path.basename(file_path),
-                "content": text,
-                "status": "initial",
-                "type": "pdf"
-            }]
-        elif file_type in ["xls", "xlsx"]:
-            return excel_to_json(file_path)
-        elif file_type == "csv":
-            return csv_to_json(file_path)
-        else:
-            return [{"error": f"Unsupported file type: {file_type}"}]
-    except Exception as e:
-        logger.error("Error processing %s: %s", os.path.basename(file_path), e)
-        return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]
-def tokenize_and_chunk(text: str, max_tokens: int = 1800) -> List[str]:
-    tokens = tokenizer.encode(text)
-    chunks = []
-    for i in range(0, len(tokens), max_tokens):
-        chunk_tokens = tokens[i:i + max_tokens]
-        chunks.append(tokenizer.decode(chunk_tokens))
-    return chunks
-def log_system_usage(tag=""):
-    try:
-        cpu = psutil.cpu_percent(interval=1)
-        mem = psutil.virtual_memory()
-        logger.info("[%s] CPU: %.1f%% | RAM: %dMB / %dMB", tag, cpu, mem.used // (1024**2), mem.total // (1024**2))
-        result = subprocess.run(
-            ["nvidia-smi", "--query-gpu=memory.used,memory.total,utilization.gpu", "--format=csv,nounits,noheader"],
-            capture_output=True, text=True
-        )
-        if result.returncode == 0:
-            used, total, util = result.stdout.strip().split(", ")
-            logger.info("[%s] GPU: %sMB / %sMB | Utilization: %s%%", tag, used, total, util)
-    except Exception as e:
-        logger.error("[%s] GPU/CPU monitor failed: %s", tag, e)
-def clean_response(text: str) -> str:
-    text = sanitize_utf8(text)
-    text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
-    diagnoses = []
-    lines = text.splitlines()
-    in_diagnoses_section = False
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        if re.match(r"###\s*Missed Diagnoses", line):
-            in_diagnoses_section = True
-            continue
-        if re.match(r"###\s*(Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line):
-            in_diagnoses_section = False
-            continue
-        if in_diagnoses_section and re.match(r"-\s*.+", line):
-            diagnosis = re.sub(r"^\-\s*", "", line).strip()
-            if diagnosis and not re.match(r"No issues identified", diagnosis, re.IGNORECASE):
-                diagnoses.append(diagnosis)
-    text = " ".join(diagnoses)
-    text = re.sub(r"\s+", " ", text).strip()
-    text = re.sub(r"[^\w\s\.\,\(\)\-]", "", text)
-    return text if text else ""
-def summarize_findings(combined_response: str) -> str:
-    chunks = combined_response.split("--- Analysis for Chunk")
-    diagnoses = []
-    for chunk in chunks:
-        chunk = chunk.strip()
-        if not chunk or "No oversights identified" in chunk:
-            continue
-        lines = chunk.splitlines()
-        in_diagnoses_section = False
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-            if re.match(r"###\s*Missed Diagnoses", line):
-                in_diagnoses_section = True
-                continue
-            if re.match(r"###\s*(Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line):
-                in_diagnoses_section = False
-                continue
-            if in_diagnoses_section and re.match(r"-\s*.+", line):
-                diagnosis = re.sub(r"^\-\s*", "", line).strip()
-                if diagnosis and not re.match(r"No issues identified", diagnosis, re.IGNORECASE):
-                    diagnoses.append(diagnosis)
-    seen = set()
-    unique_diagnoses = [d for d in diagnoses if not (d in seen or seen.add(d))]
-    if not unique_diagnoses:
-        return "No missed diagnoses were identified in the provided records."
-    summary = "Missed diagnoses include " + ", ".join(unique_diagnoses[:-1])
-    if len(unique_diagnoses) > 1:
-        summary += f", and {unique_diagnoses[-1]}"
-    elif len(unique_diagnoses) == 1:
-        summary = "Missed diagnoses include " + unique_diagnoses[0]
-    summary += ", all of which require urgent clinical review to prevent potential adverse outcomes."
-    return summary.strip()
-def update_progress(current: int, total: int, stage: str = "") -> Dict[str, Any]:
-    progress = f"{stage} - {current}/{total}" if stage else f"{current}/{total}"
-    return {"value": progress, "visible": True, "label": f"Progress: {progress}"}
-def init_agent():
-    logger.info("Initializing model...")
-    log_system_usage("Before Load")
-    default_tool_path = os.path.abspath("data/new_tool.json")
-    target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
-    if not os.path.exists(target_tool_path):
-        shutil.copy(default_tool_path, target_tool_path)
-    agent = TxAgent(
-        model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
-        rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
-        tool_files_dict={"new_tool": target_tool_path},
-        force_finish=True,
-        enable_checker=False,
-        step_rag_num=4,
-        seed=100,
-        additional_default_tools=[],
-    )
-    agent.init_model()
-    log_system_usage("After Load")
-    logger.info("Agent Ready")
-    return agent
-def process_response_stream(prompt: str, history: List[dict]) -> Generator[dict, None, None]:
-    full_response = ""
-    for chunk_output in agent.run_gradio_chat(prompt, [], 0.2, 512, 2048, False, []):
-        if chunk_output is None:
-            continue
-        if isinstance(chunk_output, list):
-            for m in chunk_output:
-                if hasattr(m, 'content') and m.content:
-                    cleaned = clean_response(m.content)
-                    if cleaned:
-                        full_response += cleaned + " "
-                        yield {"role": "assistant", "content": full_response}
-        elif isinstance(chunk_output, str) and chunk_output.strip():
-            cleaned = clean_response(chunk_output)
-            if cleaned:
-                full_response += cleaned + " "
-                yield {"role": "assistant", "content": full_response}
-    return full_response
-def analyze(message: str, history: List[dict], files: List) -> Generator[tuple, None, None]:
-    # Initialize outputs
-    chatbot_output = history.copy()
-    download_output = None
-    final_summary = ""
-    progress_text = {"value": "Starting analysis...", "visible": True}
     try:
-        # Start with user message
-        chatbot_output.append({"role": "user", "content": message})
-        yield (chatbot_output, download_output, final_summary, progress_text)
         extracted = []
         file_hash_value = ""
@@ -319,175 +43,70 @@ def analyze(message: str, history: List[dict], files: List) -> Generator[tuple,
                 for i, future in enumerate(as_completed(futures), 1):
                     try:
                         extracted.extend(future.result())
-                        progress_text = update_progress(i, len(files), "Processing files")
-                        yield (chatbot_output, download_output, final_summary, progress_text)
                     except Exception as e:
                         logger.error(f"File processing error: {e}")
                         extracted.append({"error": f"Error processing file: {str(e)}"})
             file_hash_value = file_hash(files[0].name) if files else ""
-            chatbot_output.append({"role": "assistant", "content": "✅ File processing complete"})
-            progress_text = update_progress(len(files), len(files), "Files processed")
-            yield (chatbot_output, download_output, final_summary, progress_text)
-        # Convert extracted data to JSON text
         text_content = "\n".join(json.dumps(item) for item in extracted)
-        # Tokenize and chunk the content properly
         chunks = tokenize_and_chunk(text_content)
         combined_response = ""
         for chunk_idx, chunk in enumerate(chunks, 1):
-            prompt = f"""
-Analyze the patient record excerpt for missed diagnoses only. Provide a concise, evidence-based summary as a single paragraph without headings or bullet points. Include specific clinical findings (e.g., 'elevated blood pressure (160/95) on page 10'), their potential implications (e.g., 'may indicate untreated hypertension'), and a recommendation for urgent review. Do not include other oversight categories like medication conflicts. If no missed diagnoses are found, state 'No missed diagnoses identified' in a single sentence.
-Patient Record Excerpt (Chunk {chunk_idx} of {len(chunks)}):
-{chunk[:1800]}
-"""
-            # Create a placeholder message
-            chatbot_output.append({"role": "assistant", "content": ""})
-            progress_text = update_progress(chunk_idx, len(chunks), "Analyzing")
-            yield (chatbot_output, download_output, final_summary, progress_text)
-            # Process and stream the response
             chunk_response = ""
-            for update in process_response_stream(prompt, chatbot_output):
-                chatbot_output[-1] = update
                 chunk_response = update["content"]
-                progress_text = update_progress(chunk_idx, len(chunks), "Analyzing")
-                yield (chatbot_output, download_output, final_summary, progress_text)
             combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
-            # Clean up memory
             torch.cuda.empty_cache()
             gc.collect()
-        # Generate final summary
-        final_summary = summarize_findings(combined_response)
         report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
         if report_path:
             with open(report_path, "w", encoding="utf-8") as f:
-                f.write(combined_response + "\n\n" + final_summary)
-        download_output = report_path if report_path and os.path.exists(report_path) else None
-        progress_text = {"visible": False}
-        yield (chatbot_output, download_output, final_summary, progress_text)
     except Exception as e:
         logger.error("Analysis error: %s", e)
-        chatbot_output.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
-        final_summary = f"Error occurred during analysis: {str(e)}"
-        progress_text = {"visible": False}
-        yield (chatbot_output, download_output, final_summary, progress_text)
-def clear_and_start():
-    return [
-        [],  # chatbot
-        None,  # download_output
-        "",  # final_summary
-        "",  # msg_input
-        None,  # file_upload
-        {"visible": False}  # progress_text
-    ]
-def create_ui(agent):
-    with gr.Blocks(theme=gr.themes.Soft(), title="Clinical Oversight Assistant") as demo:
-        gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
-        with gr.Row():
-            with gr.Column(scale=3):
-                chatbot = gr.Chatbot(
-                    label="Analysis Conversation",
-                    height=600,
-                    show_copy_button=True,
-                    avatar_images=(
-                        "assets/user.png",
-                        "assets/assistant.png"
-                    ) if os.path.exists("assets/user.png") else None,
-                    type="messages",  # Use openai-style messages
-                    render=False
-                )
-            with gr.Column(scale=1):
-                final_summary = gr.Markdown(
-                    label="Summary of Findings",
-                    value="### Summary will appear here\nAfter analysis completes"
-                )
-                download_output = gr.File(
-                    label="Download Full Report",
-                    visible=False
-                )
-        with gr.Row():
-            file_upload = gr.File(
-                file_types=[".pdf", ".csv", ".xls", ".xlsx"],
-                file_count="multiple",
-                label="Upload Patient Records"
-            )
-        with gr.Row():
-            msg_input = gr.Textbox(
-                placeholder="Ask about potential oversights...",
-                show_label=False,
-                container=False,
-                scale=7,
-                autofocus=True
-            )
-            send_btn = gr.Button(
-                "Analyze",
-                variant="primary",
-                scale=1,
-                min_width=100
-            )
-        progress_text = gr.Textbox(
-            label="Progress",
-            visible=False,
-            interactive=False
-        )
-        # Event handlers
-        send_btn.click(
-            analyze,
-            inputs=[msg_input, chatbot, file_upload],
-            outputs=[chatbot, download_output, final_summary, progress_text],
-            show_progress="hidden"
-        )
-        msg_input.submit(
-            analyze,
-            inputs=[msg_input, chatbot, file_upload],
-            outputs=[chatbot, download_output, final_summary, progress_text],
-            show_progress="hidden"
-        )
-        demo.load(
-            clear_and_start,
-            outputs=[chatbot, download_output, final_summary, msg_input, file_upload, progress_text],
-            queue=False
-        )
-    return demo
-if __name__ == "__main__":
-    try:
-        logger.info("Launching app...")
-        agent = init_agent()
-        demo = create_ui(agent)
-        demo.queue(
-            api_open=False,
-            max_size=20
-        ).launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            show_error=True,
-            allowed_paths=[report_dir],
-            share=False
-        )
-    except Exception as e:
-        logger.error(f"Failed to launch app: {e}")
-        raise
-    finally:
-        if torch.distributed.is_initialized():
-            torch.distributed.destroy_process_group()

+# Update the Chatbot component in create_ui() to use the new message format:
+chatbot = gr.Chatbot(
+    label="Analysis Conversation",
+    height=600,
+    show_copy_button=True,
+    avatar_images=(
+        "assets/user.png",
+        "assets/assistant.png"
+    ) if os.path.exists("assets/user.png") else None,
+    render=False,
+    bubble_full_width=False,
+    type="messages"  # Add this line to use the new format
+)
+# Update the analyze function to properly return all outputs:
+def analyze(message: str, history: List[dict], files: List) -> Generator[Dict[str, Any], None, None]:
+    # Initialize all outputs
+    outputs = {
+        "chatbot": history.copy(),
+        "download_output": None,
+        "final_summary": "",
+        "progress_text": {"value": "Starting analysis...", "visible": True}
+    }
+    yield outputs  # First yield with all outputs
     try:
+        # Add user message to history
+        history.append({"role": "user", "content": message})
+        outputs["chatbot"] = history
+        yield outputs
         extracted = []
         file_hash_value = ""
                 for i, future in enumerate(as_completed(futures), 1):
                     try:
                         extracted.extend(future.result())
+                        outputs["progress_text"] = update_progress(i, len(files), "Processing files")
+                        yield outputs
                     except Exception as e:
                         logger.error(f"File processing error: {e}")
                         extracted.append({"error": f"Error processing file: {str(e)}"})
             file_hash_value = file_hash(files[0].name) if files else ""
+            history.append({"role": "assistant", "content": "✅ File processing complete"})
+            outputs.update({
+                "chatbot": history,
+                "progress_text": update_progress(len(files), len(files), "Files processed")
+            })
+            yield outputs
+        # Process content and generate responses
         text_content = "\n".join(json.dumps(item) for item in extracted)
         chunks = tokenize_and_chunk(text_content)
         combined_response = ""
         for chunk_idx, chunk in enumerate(chunks, 1):
+            prompt = f"""Analyze this patient record for missed diagnoses..."""  # Your prompt here
+            history.append({"role": "assistant", "content": ""})
+            outputs.update({
+                "chatbot": history,
+                "progress_text": update_progress(chunk_idx, len(chunks), "Analyzing")
+            })
+            yield outputs
+            # Process response stream
             chunk_response = ""
+            for update in process_response_stream(prompt, history):
+                history[-1] = update
                 chunk_response = update["content"]
+                outputs.update({
+                    "chatbot": history,
+                    "progress_text": update_progress(chunk_idx, len(chunks), "Analyzing")
+                })
+                yield outputs
             combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
             torch.cuda.empty_cache()
             gc.collect()
+        # Final outputs
+        summary = summarize_findings(combined_response)
         report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
         if report_path:
             with open(report_path, "w", encoding="utf-8") as f:
+                f.write(combined_response + "\n\n" + summary)
+        outputs.update({
+            "download_output": report_path if report_path else None,
+            "final_summary": summary,
+            "progress_text": {"visible": False}
+        })
+        yield outputs
     except Exception as e:
         logger.error("Analysis error: %s", e)
+        history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
+        outputs.update({
+            "chatbot": history,
+            "final_summary": f"Error occurred: {str(e)}",
+            "progress_text": {"visible": False}
+        })
+        yield outputs