Spaces:

Prashasst
/

Medical_Lab_Test_Extraction_Pipeline

Sleeping

App Files Community

Prashasst committed on Mar 7

Commit

4f7c634

verified ·

1 Parent(s): ef0811b

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -365

app.py CHANGED Viewed

@@ -1,357 +1,21 @@
 import gradio as gr
 import pandas as pd
-import fitz  # PyMuPDF
-import pytesseract
-from pdf2image import convert_from_path
-import os
-import base64
-from google import genai
-from google.genai import types
-google_api=os.getenv("google_api")
-def read_pdf(pdf_path):
-    text = ""
-    doc = fitz.open(pdf_path)
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        page_text = page.get_text("text").strip()  # Extract text from page
-        # Extract Images for OCR
-        images = page.get_images(full=True)  # Check if the page has images
-        ocr_text = ""
-        if images:  # If images exist, process them
-            print(f"Page {page_num + 1} contains images, performing OCR...")
-            img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
-            for img in img_pages:
-                ocr_text += pytesseract.image_to_string(img).strip() + "\n"
-        # Combine both text extraction methods
-        combined_text = f"{page_text}\n{ocr_text}".strip()
-        if combined_text:
-            text += combined_text + "\n\n"
-    return text.strip()
-def generate(extracted_text):
-    client = genai.Client(
-        api_key=google_api,
-    )
-    model = "gemini-2.0-flash"
-    contents = [
-        types.Content(
-            role="user",
-            parts=[
-                types.Part.from_text(text="""The following text is extracted from a medical lab report using OCR.
-There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
-Please correct the errors and extract both metadata and structured lab test data.
-ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
-AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
-Return the output in structured JSON format with all the information in lowercase to standardization.
-And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified
-Extracted Text:
-Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n     \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 « % 0-6\\nMONOCYTES : 5 %
-Expected JSON format:
-{
-    \"metadata\": {
-        \"patient_name\": \"Prasahsst Pawar\",
-        \"age\": \"20\",
-        \"gender\": \"Male\",
-        \"lab_name\": \"XYZ Diagnostics\",
-        \"report_date\": \"05-03-2025\"
-    },
-    \"lab_tests\": [
-        {
-            \"test_name\": \"hemoglobin\",
-            \"value\": \"14.2\",
-            \"unit\": \"g/dL\",
-            \"reference_range\": \"13.5 - 17.5 g/dL\"
-        },
-        {
-      \"test_name\": \"rbc count\",
-      \"value\": \"5.2\",
-      \"unit\": \"million/cu mm\",
-      \"reference_range\": \"4.1-5.1\"
-    },
-        {
-            \"test_name\": \"glucose\",
-            \"value\": \"65\",
-            \"unit\": \"mg/dL\",
-            \"reference_range\": \"70 - 110 mg/dL\"
-        }
-    ],
-\"reds\":{
-    \"low\":[\"glucose\"],
-    \"high\":[\"rbc count\"]
-}
-}"""),
-            ],
-        ),
-        types.Content(
-            role="model",
-            parts=[
-                types.Part.from_text(text="""{
-  \"lab_tests\": [
-    {
-      \"reference_range\": \"12.0 - 17.0\",
-      \"test_name\": \"haemoglobin\",
-      \"unit\": \"gms/dl\",
-      \"value\": \"14\"
-    },
-    {
-      \"reference_range\": \"4.1-5.1\",
-      \"test_name\": \"rbc count\",
-      \"unit\": \"mill/cu mm\",
-      \"value\": \"4.4\"
-    },
-    {
-      \"reference_range\": \"32.0 - 47.0\",
-      \"test_name\": \"haematocrit (pcv)\",
-      \"unit\": \"%\",
-      \"value\": \"30\"
-    },
-    {
-      \"reference_range\": \"76.0 - 100.0\",
-      \"test_name\": \"mcv\",
-      \"unit\": \"fl\",
-      \"value\": \"78\"
-    },
-    {
-      \"reference_range\": \"26.0-32.0\",
-      \"test_name\": \"mch\",
-      \"unit\": \"pg\",
-      \"value\": \"32.46\"
-    },
-    {
-      \"reference_range\": \"31.5-34.5\",
-      \"test_name\": \"mchc\",
-      \"unit\": \"%\",
-      \"value\": \"32.8\"
-    },
-    {
-      \"reference_range\": \"11.6-15.0\",
-      \"test_name\": \"rdw\",
-      \"unit\": \"%\",
-      \"value\": \"13.9\"
-    },
-    {
-      \"reference_range\": \"6.8- 12.6\",
-      \"test_name\": \"mpv\",
-      \"unit\": \"fn\",
-      \"value\": \"11.2\"
-    },
-    {
-      \"reference_range\": \"4000 - 11000\",
-      \"test_name\": \"wbc count\",
-      \"unit\": \"/cu mm\",
-      \"value\": \"4567\"
-    },
-    {
-      \"reference_range\": \"40-70\",
-      \"test_name\": \"neutrophils\",
-      \"unit\": \"%\",
-      \"value\": \"56\"
-    },
-    {
-      \"reference_range\": \"20.0- 45.0\",
-      \"test_name\": \"lymphocytes\",
-      \"unit\": \"%\",
-      \"value\": \"20\"
-    },
-    {
-      \"reference_range\": \"0-6\",
-      \"test_name\": \"eosinophils\",
-      \"unit\": \"%\",
-      \"value\": \"4\"
-    },
-    {
-      \"reference_range\": \"2-10\",
-      \"test_name\": \"monocytes\",
-      \"unit\": \"%\",
-      \"value\": \"5\"
-    }
-  ],
-  \"metadata\": {
-    \"age\": \"40\",
-    \"gender\": \"male\",
-    \"lab_name\": \"sanjeevan hospital\",
-    \"patient_name\": \"amar shaha\",
-    \"report_date\": \"09-jul-20\"
-  },
-  \"reds\": {
-    \"high\": [
-      \"mch\"
-    ],
-    \"low\": [
-      \"haematocrit (pcv)\"
-    ]
-  }
-}"""),
-            ],
-        ),
-        types.Content(
-            role="user",
-            parts=[
-                types.Part.from_text(text=extracted_text),
-            ],
-        ),
-    ]
-    generate_content_config = types.GenerateContentConfig(
-        temperature=1,
-        top_p=0.95,
-        top_k=40,
-        max_output_tokens=8192,
-        response_mime_type="application/json",
-        response_schema=genai.types.Schema(
-            type = genai.types.Type.OBJECT,
-            enum = [],
-            required = ["metadata", "lab_tests", "reds"],
-            properties = {
-                "metadata": genai.types.Schema(
-                    type = genai.types.Type.OBJECT,
-                    enum = [],
-                    required = ["patient_name", "age", "gender", "lab_name", "report_date"],
-                    properties = {
-                        "patient_name": genai.types.Schema(
-                            type = genai.types.Type.STRING,
-                        ),
-                        "age": genai.types.Schema(
-                            type = genai.types.Type.STRING,
-                        ),
-                        "gender": genai.types.Schema(
-                            type = genai.types.Type.STRING,
-                        ),
-                        "lab_name": genai.types.Schema(
-                            type = genai.types.Type.STRING,
-                        ),
-                        "report_date": genai.types.Schema(
-                            type = genai.types.Type.STRING,
-                        ),
-                    },
-                ),
-                "lab_tests": genai.types.Schema(
-                    type = genai.types.Type.ARRAY,
-                    items = genai.types.Schema(
-                        type = genai.types.Type.OBJECT,
-                        enum = [],
-                        required = ["test_name", "value", "unit", "reference_range"],
-                        properties = {
-                            "test_name": genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                            "value": genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                            "unit": genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                            "reference_range": genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                        },
-                    ),
-                ),
-                "reds": genai.types.Schema(
-                    type = genai.types.Type.OBJECT,
-                    enum = [],
-                    required = ["low", "high"],
-                    properties = {
-                        "low": genai.types.Schema(
-                            type = genai.types.Type.ARRAY,
-                            items = genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                        ),
-                        "high": genai.types.Schema(
-                            type = genai.types.Type.ARRAY,
-                            items = genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                        ),
-                    },
-                ),
-            },
-        ),
-        system_instruction=[
-            types.Part.from_text(text="""Always return the output as JSON only"""),
-        ],
-    )
-    # for chunk in client.models.generate_content_stream(
-    #     model=model,
-    #     contents=contents,
-    #     config=generate_content_config,
-    # ):
-    #     print(chunk.text, end="")
-    response = client.models.generate_content(
-        model=model,
-        contents=contents,
-        config=generate_content_config,
-    )
-    json_response = response.text  # The API should return JSON text
-    parsed_json = json.loads(json_response)  # Convert JSON string to Python dictionary
-    return parsed_json
-# Gradio interface function
-def process_pdf(pdf):
-    text = read_pdf(pdf)  # Extract text from PDF
-    # # print(text)
-    output = generate(text)  # Generate structured JSON
-    return output
-def show_to_UI(pdf):
-    output = process_pdf(pdf)  # Call process_pdf to get JSON
-    # Extract metadata
     metadata = output["metadata"]
-    # labtests = pd.DataFrame(output["lab_tests"])
-    # reds = pd.DataFrame(output["reds"])
-    try:
-      labtests = pd.DataFrame(output["lab_tests"],)
-    except Exception as e:
-      print(f"Error creating lab tests DataFrame: {e}")
-      labtests = pd.DataFrame()  # Return empty DataFrame
-    try:
-      highs = pd.DataFrame(output["reds"]["high"],index=True)
-    except Exception as e:
-      print(f"Error creating highs DataFrame: {e}")
-      highs = pd.DataFrame()  # Return empty DataFrame
-    try:
-      lows = pd.DataFrame(output["reds"]["low"],)
-    except Exception as e:
-      print(f"Error creating lowss DataFrame: {e}")
-      lows = pd.DataFrame()  # Return empty DataFrame
     metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
                    f"**Age:** {metadata['age']}\n\n" \
@@ -359,27 +23,29 @@ def show_to_UI(pdf):
                    f"**Lab Name:** {metadata['lab_name']}\n\n" \
                    f"**Report Date:** {metadata['report_date']}"
-    return metadata_str,highs,lows, labtests,output
-# Define Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Medical Lab Report Processor")
     with gr.Row():
-        pdf_input = gr.File(label="Upload PDF Report")
         submit_btn = gr.Button("Process")
-    metadata_output = gr.Markdown("**Patient Name: Prashasst...**")
     with gr.Row():
-      high_output = gr.Dataframe(label="High Values")
-      low_output = gr.Dataframe(label="Low Values")
-    lab_test_output = gr.Dataframe(label="Lab Test Results")
-    output_JSON = gr.JSON(label="Extracted Report")  # Show JSON output
-    submit_btn.click(show_to_UI, inputs=[pdf_input], outputs=[metadata_output, high_output, low_output,lab_test_output, output_JSON])
-demo.launch(debug=True,share=True)

 import gradio as gr
 import pandas as pd
+from file_processing import FileProcessor
+from entity_recognition import process_text
+from utils import safe_dataframe
+def show_to_UI(file):
+    """Processes the uploaded file and extracts medical data."""
+    processor = FileProcessor()
+    text = processor.process(file.name)  # Read content
+    output = process_text(text)  # Perform entity recognition
     metadata = output["metadata"]
+    # Convert extracted data safely
+    highs = safe_dataframe(output["reds"], "high")
+    lows = safe_dataframe(output["reds"], "low")
+    labtests = safe_dataframe(output, "lab_tests")
     metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
                    f"**Age:** {metadata['age']}\n\n" \
                    f"**Lab Name:** {metadata['lab_name']}\n\n" \
                    f"**Report Date:** {metadata['report_date']}"
+    print(f"Processed report for {metadata['patient_name']}")
+    return metadata_str, highs, lows, labtests, output
+# ✅ Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("# 🏥 Medical Lab Report Processor")
     with gr.Row():
+        pdf_input = gr.File(label="📂 Upload Report")
         submit_btn = gr.Button("Process")
+    metadata_output = gr.Markdown("**Patient Name: Loading...**")
     with gr.Row():
+        high_output = gr.Dataframe(label="🔺 High Values")
+        low_output = gr.Dataframe(label="🔻 Low Values")
+    lab_test_output = gr.Dataframe(label="📊 Lab Test Results")
+    output_JSON = gr.JSON(label="📜 Extracted Report")
+    submit_btn.click(show_to_UI, inputs=[pdf_input], outputs=[metadata_output, high_output, low_output, lab_test_output, output_JSON])
+demo.launch()