CPS-Test-Mobile

Paused

App Files Files Community

Ali2206 committed 24 days ago

Commit

1a611b9

verified ·

1 Parent(s): de75e20

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -201

app.py CHANGED Viewed

@@ -3,37 +3,33 @@ import os
 import pandas as pd
 import json
 import gradio as gr
-from typing import List, Tuple, Union, Generator, BinaryIO, Dict, Any
 import re
 from datetime import datetime
 import atexit
 import torch.distributed as dist
 import logging
-# Setup logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Cleanup for PyTorch distributed
 def cleanup():
     if dist.is_initialized():
         logger.info("Cleaning up PyTorch distributed process group")
         dist.destroy_process_group()
 atexit.register(cleanup)
-# Setup directories
 persistent_dir = "/data/hf_cache"
 os.makedirs(persistent_dir, exist_ok=True)
 model_cache_dir = os.path.join(persistent_dir, "txagent_models")
 tool_cache_dir = os.path.join(persistent_dir, "tool_cache")
 file_cache_dir = os.path.join(persistent_dir, "cache")
 report_dir = os.path.join(persistent_dir, "reports")
 for d in [model_cache_dir, tool_cache_dir, file_cache_dir, report_dir]:
     os.makedirs(d, exist_ok=True)
 os.environ["HF_HOME"] = model_cache_dir
 os.environ["TRANSFORMERS_CACHE"] = model_cache_dir
@@ -55,50 +51,40 @@ def estimate_tokens(text: str) -> int:
     return len(text) // 3.5 + 1
 def extract_text_from_excel(file_obj: Union[str, Dict[str, Any]]) -> str:
-    """Handle Gradio file upload object which is a dictionary with 'name' and other keys"""
     all_text = []
-    try:
-        if isinstance(file_obj, dict) and 'name' in file_obj:
-            file_path = file_obj['name']
-        elif isinstance(file_obj, str):
-            file_path = file_obj
-        else:
-            raise ValueError("Unsupported file input type")
-        if not os.path.exists(file_path):
-            raise FileNotFoundError(f"Temporary upload file not found at: {file_path}")
-        xls = pd.ExcelFile(file_path)
-        for sheet_name in xls.sheet_names:
-            try:
-                df = xls.parse(sheet_name).astype(str).fillna("")
-                rows = df.apply(lambda row: " | ".join([cell for cell in row if cell.strip()]), axis=1)
-                sheet_text = [f"[{sheet_name}] {line}" for line in rows if line.strip()]
-                all_text.extend(sheet_text)
-            except Exception as e:
-                logger.warning(f"Could not parse sheet {sheet_name}: {e}")
-                continue
-        return "\n".join(all_text)
-    except Exception as e:
-        raise ValueError(f"❌ Error processing Excel file: {str(e)}")
 def split_text_into_chunks(text: str) -> List[str]:
-    effective_max = MAX_CHUNK_TOKENS - PROMPT_OVERHEAD
-    lines, chunks, curr_chunk, curr_tokens = text.split("\n"), [], [], 0
     for line in lines:
         t = estimate_tokens(line)
-        if curr_tokens + t > effective_max:
-            if curr_chunk:
-                chunks.append("\n".join(curr_chunk))
-            curr_chunk, curr_tokens = [line], t
         else:
-            curr_chunk.append(line)
-            curr_tokens += t
-    if curr_chunk:
-        chunks.append("\n".join(curr_chunk))
     return chunks
 def build_prompt_from_text(chunk: str) -> str:
@@ -120,196 +106,113 @@ Provide a structured response with clear medical reasoning.
 """
 def validate_tool_file(tool_name: str, tool_path: str) -> bool:
-    """Validate the structure of a tool JSON file. Return True if valid, False if invalid."""
     try:
         if not os.path.exists(tool_path):
-            logger.error(f"Tool file not found: {tool_path}")
             return False
         with open(tool_path, 'r') as f:
             tool_data = json.load(f)
-        logger.info(f"Contents of {tool_name} ({tool_path}): {tool_data}")
-        if isinstance(tool_data, str):
-            logger.error(f"Invalid tool file {tool_name}: JSON root is a string, expected list or dict")
-            return False
-        elif isinstance(tool_data, list):
-            for item in tool_data:
-                if not isinstance(item, dict):
-                    logger.error(f"Invalid tool format in {tool_name}: each item must be a dict, got {type(item)}: {item}")
-                    return False
-                if 'name' not in item:
-                    logger.error(f"Invalid tool format in {tool_name}: each dict must have a 'name' key, got {item}")
-                    return False
         elif isinstance(tool_data, dict):
             if 'tools' in tool_data:
-                if not isinstance(tool_data['tools'], list):
-                    logger.error(f"'tools' field in {tool_name} must be a list, got {type(tool_data['tools'])}")
-                    return False
-                for item in tool_data['tools']:
-                    if not isinstance(item, dict):
-                        logger.error(f"Invalid tool format in {tool_name}: each tool must be a dict, got {type(item)}: {item}")
-                        return False
-                    if 'name' not in item:
-                        logger.error(f"Invalid tool format in {tool_name}: each tool dict must have a 'name' key, got {item}")
-                        return False
-            else:
-                if 'name' not in tool_data:
-                    logger.error(f"Invalid tool format in {tool_name}: dict must have a 'name' key or 'tools' field, got {tool_data}")
-                    return False
-        else:
-            logger.error(f"Invalid tool file {tool_name}: must be a list or dict, got {type(tool_data)}")
-            return False
-        return True
     except Exception as e:
-        logger.error(f"Error validating tool file {tool_name} ({tool_path}): {str(e)}")
         return False
 def init_agent() -> TxAgent:
-    tool_path = os.path.join(tool_cache_dir, "new_tool.json")
-    logger.info(f"Checking for tool file at: {tool_path}")
-    # Create default tool file if it doesn't exist
-    if not os.path.exists(tool_path):
-        default_tool = {
-            "name": "new_tool",
-            "description": "Default tool configuration",
-            "version": "1.0",
-            "tools": [
-                {"name": "dummy_tool", "description": "Dummy tool for testing", "version": "1.0"}
-            ]
-        }
-        logger.info(f"Creating default tool file at: {tool_path}")
-        with open(tool_path, 'w') as f:
-            json.dump(default_tool, f)
-    # Define tool files
-    tool_files_dict = {
         'opentarget': '/home/user/.pyenv/versions/3.10.17/lib/python3.10/site-packages/tooluniverse/data/opentarget_tools.json',
         'fda_drug_label': '/home/user/.pyenv/versions/3.10.17/lib/python3.10/site-packages/tooluniverse/data/fda_drug_labeling_tools.json',
         'special_tools': '/home/user/.pyenv/versions/3.10.17/lib/python3.10/site-packages/tooluniverse/data/special_tools.json',
         'monarch': '/home/user/.pyenv/versions/3.10.17/lib/python3.10/site-packages/tooluniverse/data/monarch_tools.json',
-        'new_tool': tool_path
     }
-    # Validate all tool files and filter invalid ones
-    valid_tool_files = {}
-    for tool_name, tool_path in tool_files_dict.items():
-        if validate_tool_file(tool_name, tool_path):
-            valid_tool_files[tool_name] = tool_path
-        else:
-            logger.warning(f"Skipping invalid tool file: {tool_name} ({tool_path})")
-    if not valid_tool_files:
-        raise ValueError("No valid tool files found after validation")
-    # For testing, you can use only new_tool.json to isolate the issue
-    # valid_tool_files = {'new_tool': tool_path}
-    # Initialize TxAgent
-    try:
-        logger.info(f"Initializing TxAgent with tool_files_dict: {valid_tool_files}")
-        agent = TxAgent(
-            model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
-            rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
-            tool_files_dict=valid_tool_files,
-            force_finish=True,
-            enable_checker=True,
-            step_rag_num=4,
-            seed=100
-        )
-        logger.info("TxAgent initialized, calling init_model")
-        agent.init_model()
-        logger.info("TxAgent model initialized successfully")
-        return agent
-    except Exception as e:
-        logger.error(f"Error initializing TxAgent: {str(e)}", exc_info=True)
-        raise
 def stream_report(agent: TxAgent, input_file: Union[str, Dict[str, Any]], full_output: str) -> Generator[Tuple[str, Union[str, None], str], None, None]:
-    accumulated_text = ""
     try:
-        if input_file is None:
-            yield "❌ Please upload a valid Excel file.", None, ""
-            return
-        try:
-            text = extract_text_from_excel(input_file)
-            chunks = split_text_into_chunks(text)
-        except Exception as e:
-            yield f"❌ {str(e)}", None, ""
-            return
-        for i, chunk in enumerate(chunks):
-            prompt = build_prompt_from_text(chunk)
-            partial = ""
-            for res in agent.run_gradio_chat(
-                message=prompt, history=[], temperature=0.2,
-                max_new_tokens=MAX_NEW_TOKENS, max_token=MAX_MODEL_TOKENS,
-                call_agent=False, conversation=[]
-            ):
-                partial += res if isinstance(res, str) else res.content
-            cleaned = clean_response(partial)
-            accumulated_text += f"\n\n📄 Analysis Part {i+1}:\n{cleaned}"
-            yield accumulated_text, None, ""
-        summary_prompt = f"Please summarize this analysis:\n\n{accumulated_text}"
-        final_report = ""
-        for res in agent.run_gradio_chat(
-            message=summary_prompt, history=[], temperature=0.2,
             max_new_tokens=MAX_NEW_TOKENS, max_token=MAX_MODEL_TOKENS,
             call_agent=False, conversation=[]
         ):
-            final_report += res if isinstance(res, str) else res.content
-        cleaned = clean_response(final_report)
-        report_path = os.path.join(report_dir, f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md")
-        with open(report_path, 'w') as f:
-            f.write(f"# Clinical Analysis Report\n\n{cleaned}")
-        yield f"{accumulated_text}\n\n📊 Final Summary:\n{cleaned}", report_path, cleaned
-    except Exception as e:
-        logger.error(f"Processing error in stream_report: {str(e)}", exc_info=True)
-        yield f"❌ Processing error: {str(e)}", None, ""
 def create_ui(agent: TxAgent) -> gr.Blocks:
-    with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 900px !important}") as demo:
-        gr.Markdown("""# Clinical Records Analyzer""")
         with gr.Row():
             file_upload = gr.File(label="Upload Excel File", file_types=[".xlsx"])
             analyze_btn = gr.Button("Analyze", variant="primary")
         with gr.Row():
             with gr.Column(scale=2):
                 report_output = gr.Markdown()
             with gr.Column(scale=1):
-                report_file = gr.File(label="Download Report", visible=False)
         full_output = gr.State()
-        analyze_btn.click(
-            fn=stream_report,
-            inputs=[file_upload, full_output],
-            outputs=[report_output, report_file, full_output]
-        )
     return demo
 if __name__ == "__main__":
     try:
         agent = init_agent()
         demo = create_ui(agent)
-        logger.info("Launching Gradio UI")
-        demo.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            share=False
-        )
     except Exception as e:
-        logger.error(f"Application error: {str(e)}", exc_info=True)
-        print(f"Application error: {str(e)}", file=sys.stderr)
-        sys.exit(1)

 import pandas as pd
 import json
 import gradio as gr
+from typing import List, Tuple, Union, Generator, Dict, Any
 import re
 from datetime import datetime
 import atexit
 import torch.distributed as dist
 import logging
+# Logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# PyTorch cleanup
 def cleanup():
     if dist.is_initialized():
         logger.info("Cleaning up PyTorch distributed process group")
         dist.destroy_process_group()
 atexit.register(cleanup)
+# Directories
 persistent_dir = "/data/hf_cache"
 os.makedirs(persistent_dir, exist_ok=True)
 model_cache_dir = os.path.join(persistent_dir, "txagent_models")
 tool_cache_dir = os.path.join(persistent_dir, "tool_cache")
 file_cache_dir = os.path.join(persistent_dir, "cache")
 report_dir = os.path.join(persistent_dir, "reports")
 for d in [model_cache_dir, tool_cache_dir, file_cache_dir, report_dir]:
     os.makedirs(d, exist_ok=True)
 os.environ["HF_HOME"] = model_cache_dir
 os.environ["TRANSFORMERS_CACHE"] = model_cache_dir
     return len(text) // 3.5 + 1
 def extract_text_from_excel(file_obj: Union[str, Dict[str, Any]]) -> str:
+    if isinstance(file_obj, dict) and 'name' in file_obj:
+        file_path = file_obj['name']
+    elif isinstance(file_obj, str):
+        file_path = file_obj
+    else:
+        raise ValueError("Unsupported file input type")
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+    xls = pd.ExcelFile(file_path)
     all_text = []
+    for sheet in xls.sheet_names:
+        try:
+            df = xls.parse(sheet).astype(str).fillna("")
+            rows = df.apply(lambda r: " | ".join([c for c in r if c.strip()]), axis=1)
+            sheet_text = [f"[{sheet}] {line}" for line in rows if line.strip()]
+            all_text.extend(sheet_text)
+        except Exception as e:
+            logger.warning(f"Failed to parse {sheet}: {e}")
+    return "\n".join(all_text)
 def split_text_into_chunks(text: str) -> List[str]:
+    lines = text.split("\n")
+    chunks, current, current_tokens = [], [], 0
+    max_tokens = MAX_CHUNK_TOKENS - PROMPT_OVERHEAD
     for line in lines:
         t = estimate_tokens(line)
+        if current_tokens + t > max_tokens:
+            chunks.append("\n".join(current))
+            current, current_tokens = [line], t
         else:
+            current.append(line)
+            current_tokens += t
+    if current:
+        chunks.append("\n".join(current))
     return chunks
 def build_prompt_from_text(chunk: str) -> str:
 """
 def validate_tool_file(tool_name: str, tool_path: str) -> bool:
     try:
         if not os.path.exists(tool_path):
+            logger.error(f"Missing tool file: {tool_path}")
             return False
         with open(tool_path, 'r') as f:
             tool_data = json.load(f)
+        if isinstance(tool_data, list):
+            return all(isinstance(item, dict) and 'name' in item for item in tool_data)
         elif isinstance(tool_data, dict):
             if 'tools' in tool_data:
+                return all(isinstance(item, dict) and 'name' in item for item in tool_data['tools'])
+            return 'name' in tool_data
+        logger.error(f"Invalid format in tool: {tool_name}")
+        return False
     except Exception as e:
+        logger.error(f"Error in {tool_name}: {e}")
         return False
 def init_agent() -> TxAgent:
+    new_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
+    if not os.path.exists(new_tool_path):
+        with open(new_tool_path, 'w') as f:
+            json.dump({
+                "name": "new_tool",
+                "description": "Default tool",
+                "tools": [{"name": "dummy_tool", "description": "test", "version": "1.0"}]
+            }, f)
+    tool_files = {
         'opentarget': '/home/user/.pyenv/versions/3.10.17/lib/python3.10/site-packages/tooluniverse/data/opentarget_tools.json',
         'fda_drug_label': '/home/user/.pyenv/versions/3.10.17/lib/python3.10/site-packages/tooluniverse/data/fda_drug_labeling_tools.json',
         'special_tools': '/home/user/.pyenv/versions/3.10.17/lib/python3.10/site-packages/tooluniverse/data/special_tools.json',
         'monarch': '/home/user/.pyenv/versions/3.10.17/lib/python3.10/site-packages/tooluniverse/data/monarch_tools.json',
+        'new_tool': new_tool_path
     }
+    valid_tools = {k: v for k, v in tool_files.items() if validate_tool_file(k, v)}
+    if not valid_tools:
+        raise ValueError("No valid tool files")
+    agent = TxAgent(
+        model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
+        rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
+        tool_files_dict=valid_tools,
+        force_finish=True,
+        enable_checker=True,
+        step_rag_num=4,
+        seed=100
+    )
+    agent.init_model()
+    return agent
 def stream_report(agent: TxAgent, input_file: Union[str, Dict[str, Any]], full_output: str) -> Generator[Tuple[str, Union[str, None], str], None, None]:
+    accumulated = ""
+    if input_file is None:
+        yield "❌ Upload an Excel file.", None, ""
+        return
     try:
+        text = extract_text_from_excel(input_file)
+        chunks = split_text_into_chunks(text)
+    except Exception as e:
+        yield f"❌ Error: {str(e)}", None, ""
+        return
+    for i, chunk in enumerate(chunks):
+        prompt = build_prompt_from_text(chunk)
+        result = ""
+        for out in agent.run_gradio_chat(
+            message=prompt, history=[], temperature=0.2,
             max_new_tokens=MAX_NEW_TOKENS, max_token=MAX_MODEL_TOKENS,
             call_agent=False, conversation=[]
         ):
+            result += out if isinstance(out, str) else out.content
+        cleaned = clean_response(result)
+        accumulated += f"\n\n📄 Part {i+1}:\n{cleaned}"
+        yield accumulated, None, ""
+    summary_prompt = f"Summarize this analysis:\n\n{accumulated}"
+    summary = ""
+    for out in agent.run_gradio_chat(
+        message=summary_prompt, history=[], temperature=0.2,
+        max_new_tokens=MAX_NEW_TOKENS, max_token=MAX_MODEL_TOKENS,
+        call_agent=False, conversation=[]
+    ):
+        summary += out if isinstance(out, str) else out.content
+    final = clean_response(summary)
+    report_path = os.path.join(report_dir, f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md")
+    with open(report_path, 'w') as f:
+        f.write(f"# Clinical Report\n\n{final}")
+    yield f"{accumulated}\n\n📊 Final Summary:\n{final}", report_path, final
 def create_ui(agent: TxAgent) -> gr.Blocks:
+    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🏥 Clinical Records Analyzer")
         with gr.Row():
             file_upload = gr.File(label="Upload Excel File", file_types=[".xlsx"])
             analyze_btn = gr.Button("Analyze", variant="primary")
         with gr.Row():
             with gr.Column(scale=2):
                 report_output = gr.Markdown()
             with gr.Column(scale=1):
+                report_file = gr.File(label="Download", visible=False)
         full_output = gr.State()
+        analyze_btn.click(fn=stream_report, inputs=[file_upload, full_output], outputs=[report_output, report_file, full_output])
     return demo
 if __name__ == "__main__":
     try:
         agent = init_agent()
         demo = create_ui(agent)
+        demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
     except Exception as e:
+        logger.error(f"App error: {e}", exc_info=True)
+        print(f"❌ Application error: {e}", file=sys.stderr)
+        sys.exit(1)