Spaces:

Prashasst
/

Medical_Lab_Test_Extraction_Pipeline

Sleeping

App Files Files Community

Prashasst committed on Mar 6

Commit

52bb493

verified ·

1 Parent(s): b73ad8a

Update app.py

Browse files

Added JSON to api

Files changed (1) hide show

app.py +301 -20

app.py CHANGED Viewed

@@ -1,30 +1,305 @@
 import gradio as gr
 import pandas as pd
 import os
 googel_api=os.getenv("google_api")
-# Your existing functions: read_pdf, generate, showdata
 def read_pdf(pdf_path):
-    # Implement PDF reading logic here
-    return "Extracted text from PDF"
-def generate(text):
-    # Implement JSON generation logic here
-    return {
-        "metadata": {
-            "patient_name": "Amar Shaha",
-            "age": "40",
-            "gender": "Male",
-            "lab_name": "Sanjeevan Hospital",
-            "report_date": "09-Jul-2020"
         },
-        "lab_tests": [
-            {"test_name": "hemoglobin", "value": "14", "unit": "g/dL", "reference_range": "12.0 - 17.0"},
-            {"test_name": "rbc count", "value": "4.4", "unit": "million/cu mm", "reference_range": "4.1 - 5.1"}
-        ]
     }
 def showdata(lab_tests):
     df = pd.DataFrame(lab_tests)
@@ -33,17 +308,19 @@ def showdata(lab_tests):
 # Gradio interface function
 def process_pdf(pdf):
     text = read_pdf(pdf.name)  # Extract text from PDF
-    json_data = generate(text)  # Generate structured JSON
     metadata = json_data["metadata"]
     metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
                    f"**Age:** {metadata['age']}\n\n" \
                    f"**Gender:** {metadata['gender']}\n\n" \
                    f"**Lab Name:** {metadata['lab_name']}\n\n" \
                    f"**Report Date:** {metadata['report_date']}"
-    lab_tests_df = showdata(json_data["lab_tests"])  # Convert lab test results to DataFrame
-    return metadata_str, lab_tests_df
 # Define Gradio interface
 with gr.Blocks() as demo:
@@ -58,5 +335,9 @@ with gr.Blocks() as demo:
     submit_btn.click(process_pdf, inputs=[pdf_input], outputs=[metadata_output, lab_test_output])
 # Launch the app
 demo.launch()

 import gradio as gr
 import pandas as pd
 import os
+import fitz
+import pytesseract
+import base64
+from google import genai
+from google.genai import types
 googel_api=os.getenv("google_api")
 def read_pdf(pdf_path):
+    """Return the text of the PDF at *pdf_path*, using OCR for pages without text.
+
+    Each page's embedded text is used when present. A page whose extracted
+    text is empty or whitespace-only is rendered to a PNG and passed to
+    Tesseract instead. The per-page texts are joined with newlines and the
+    result is stripped of leading/trailing whitespace.
+    """
+    import tempfile  # local import: only needed for the OCR fallback
+
+    # 200 DPI relative to the PDF's native 72 DPI, for legible OCR input.
+    zoom = 200 / 72
+    chunks = []
+    # Opening the document in a `with` block closes the file handle even if
+    # text extraction or OCR raises.
+    with fitz.open(pdf_path) as doc:
+        for page in doc:
+            page_text = page.get_text()
+            if page_text.strip():
+                chunks.append(page_text)
+                continue
+            # No text layer on this page. The original code called
+            # convert_from_path(), which is never imported in this file, so
+            # render the page with PyMuPDF and hand Tesseract the image path.
+            pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                image_path = os.path.join(tmp_dir, "page.png")
+                pixmap.save(image_path)
+                chunks.append(pytesseract.image_to_string(image_path))
+    return "".join(chunk + "\n" for chunk in chunks).strip()
+def generate(extracted_text):
+    """Convert OCR text from a lab report into structured data via Gemini.
+
+    The request contains one worked example (a user prompt with instructions
+    and sample OCR text, followed by the model's expected answer) and then
+    *extracted_text* as the final user turn. The response is constrained to
+    JSON by ``response_mime_type`` and ``response_schema``.
+
+    Returns:
+        The parsed response as a dict. The schema requires the keys
+        ``metadata``, ``lab_tests`` and ``reds``.
+
+    Raises:
+        RuntimeError: if the ``google_api`` environment variable is not set.
+        ValueError: if the model returns no text, or text that is not valid
+            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
+    """
+    import json  # local import: the file's import block does not provide json
+
+    # Read the key directly from the environment: the module-level variable is
+    # spelled `googel_api`, so the original `google_api` lookup raised NameError.
+    api_key = os.getenv("google_api")
+    if not api_key:
+        raise RuntimeError('The "google_api" environment variable is not set')
+    client = genai.Client(
+        api_key=api_key,
+    )
+    model = "gemini-2.0-flash"
+    contents = [
+        types.Content(
+            role="user",
+            parts=[
+                types.Part.from_text(text="""The following text is extracted from a medical lab report using OCR.
+There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
+Please correct the errors and extract both metadata and structured lab test data.
+ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
+AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
+Return the output in structured JSON format with all the information in lowercase to standardization.
+And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified
+Extracted Text:
+Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n     \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 « % 0-6\\nMONOCYTES : 5 %
+Expected JSON format:
+{
+    \"metadata\": {
+        \"patient_name\": \"Prasahsst Pawar\",
+        \"age\": \"20\",
+        \"gender\": \"Male\",
+        \"lab_name\": \"XYZ Diagnostics\",
+        \"report_date\": \"05-03-2025\"
+    },
+    \"lab_tests\": [
+        {
+            \"test_name\": \"hemoglobin\",
+            \"value\": \"14.2\",
+            \"unit\": \"g/dL\",
+            \"reference_range\": \"13.5 - 17.5 g/dL\"
         },
+        {
+      \"test_name\": \"rbc count\",
+      \"value\": \"5.2\",
+      \"unit\": \"million/cu mm\",
+      \"reference_range\": \"4.1-5.1\"
+    },
+        {
+            \"test_name\": \"glucose\",
+            \"value\": \"65\",
+            \"unit\": \"mg/dL\",
+            \"reference_range\": \"70 - 110 mg/dL\"
+        }
+    ],
+\"reds\":{
+    \"low\":[\"glucose\"],
+    \"high\":[\"rbc count\"]
+}
+}"""),
+            ],
+        ),
+        types.Content(
+            role="model",
+            parts=[
+                types.Part.from_text(text="""{
+  \"lab_tests\": [
+    {
+      \"reference_range\": \"12.0 - 17.0\",
+      \"test_name\": \"haemoglobin\",
+      \"unit\": \"gms/dl\",
+      \"value\": \"14\"
+    },
+    {
+      \"reference_range\": \"4.1-5.1\",
+      \"test_name\": \"rbc count\",
+      \"unit\": \"mill/cu mm\",
+      \"value\": \"4.4\"
+    },
+    {
+      \"reference_range\": \"32.0 - 47.0\",
+      \"test_name\": \"haematocrit (pcv)\",
+      \"unit\": \"%\",
+      \"value\": \"30\"
+    },
+    {
+      \"reference_range\": \"76.0 - 100.0\",
+      \"test_name\": \"mcv\",
+      \"unit\": \"fl\",
+      \"value\": \"78\"
+    },
+    {
+      \"reference_range\": \"26.0-32.0\",
+      \"test_name\": \"mch\",
+      \"unit\": \"pg\",
+      \"value\": \"32.46\"
+    },
+    {
+      \"reference_range\": \"31.5-34.5\",
+      \"test_name\": \"mchc\",
+      \"unit\": \"%\",
+      \"value\": \"32.8\"
+    },
+    {
+      \"reference_range\": \"11.6-15.0\",
+      \"test_name\": \"rdw\",
+      \"unit\": \"%\",
+      \"value\": \"13.9\"
+    },
+    {
+      \"reference_range\": \"6.8- 12.6\",
+      \"test_name\": \"mpv\",
+      \"unit\": \"fn\",
+      \"value\": \"11.2\"
+    },
+    {
+      \"reference_range\": \"4000 - 11000\",
+      \"test_name\": \"wbc count\",
+      \"unit\": \"/cu mm\",
+      \"value\": \"4567\"
+    },
+    {
+      \"reference_range\": \"40-70\",
+      \"test_name\": \"neutrophils\",
+      \"unit\": \"%\",
+      \"value\": \"56\"
+    },
+    {
+      \"reference_range\": \"20.0- 45.0\",
+      \"test_name\": \"lymphocytes\",
+      \"unit\": \"%\",
+      \"value\": \"20\"
+    },
+    {
+      \"reference_range\": \"0-6\",
+      \"test_name\": \"eosinophils\",
+      \"unit\": \"%\",
+      \"value\": \"4\"
+    },
+    {
+      \"reference_range\": \"2-10\",
+      \"test_name\": \"monocytes\",
+      \"unit\": \"%\",
+      \"value\": \"5\"
     }
+  ],
+  \"metadata\": {
+    \"age\": \"40\",
+    \"gender\": \"male\",
+    \"lab_name\": \"sanjeevan hospital\",
+    \"patient_name\": \"amar shaha\",
+    \"report_date\": \"09-jul-20\"
+  },
+  \"reds\": {
+    \"high\": [
+      \"mch\"
+    ],
+    \"low\": [
+      \"haematocrit (pcv)\"
+    ]
+  }
+}"""),
+            ],
+        ),
+        # The actual report to process, sent as the final user turn.
+        types.Content(
+            role="user",
+            parts=[
+                types.Part.from_text(text=extracted_text),
+            ],
+        ),
+    ]
+    generate_content_config = types.GenerateContentConfig(
+        temperature=1,
+        top_p=0.95,
+        top_k=40,
+        max_output_tokens=8192,
+        response_mime_type="application/json",
+        # Schema the response must follow: metadata object, lab_tests array,
+        # and reds object holding the "low" and "high" string lists.
+        response_schema=genai.types.Schema(
+            type = genai.types.Type.OBJECT,
+            enum = [],
+            required = ["metadata", "lab_tests", "reds"],
+            properties = {
+                "metadata": genai.types.Schema(
+                    type = genai.types.Type.OBJECT,
+                    enum = [],
+                    required = ["patient_name", "age", "gender", "lab_name", "report_date"],
+                    properties = {
+                        "patient_name": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                        "age": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                        "gender": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                        "lab_name": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                        "report_date": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                    },
+                ),
+                "lab_tests": genai.types.Schema(
+                    type = genai.types.Type.ARRAY,
+                    items = genai.types.Schema(
+                        type = genai.types.Type.OBJECT,
+                        enum = [],
+                        required = ["test_name", "value", "unit", "reference_range"],
+                        properties = {
+                            "test_name": genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                            "value": genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                            "unit": genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                            "reference_range": genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                        },
+                    ),
+                ),
+                "reds": genai.types.Schema(
+                    type = genai.types.Type.OBJECT,
+                    enum = [],
+                    required = ["low", "high"],
+                    properties = {
+                        "low": genai.types.Schema(
+                            type = genai.types.Type.ARRAY,
+                            items = genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                        ),
+                        "high": genai.types.Schema(
+                            type = genai.types.Type.ARRAY,
+                            items = genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                        ),
+                    },
+                ),
+            },
+        ),
+        system_instruction=[
+            types.Part.from_text(text="""Always return the output as JSON only"""),
+        ],
+    )
+    response = client.models.generate_content(
+        model=model,
+        contents=contents,
+        config=generate_content_config,
+    )
+    json_response = response.text  # JSON text, per response_mime_type above
+    if not json_response:
+        # Fail with a clear message instead of an opaque TypeError from
+        # json.loads(None) when the response carries no text.
+        raise ValueError("Gemini returned an empty response; no JSON to parse")
+    return json.loads(json_response)
 def showdata(lab_tests):
     df = pd.DataFrame(lab_tests)
 # Gradio interface function
 def process_pdf(pdf):
+    """Extract a lab report from an uploaded PDF.
+
+    Args:
+        pdf: uploaded file object; only its ``name`` attribute (the path
+            passed to ``read_pdf``) is used.
+
+    Returns:
+        A 3-tuple ``(metadata_str, labtests, output)``: a Markdown summary of
+        the report metadata, a DataFrame built from ``output["lab_tests"]``,
+        and the full dict returned by ``generate``.
+    """
     text = read_pdf(pdf.name)  # Extract text from PDF
+    output = generate(text)  # Generate structured JSON
+    labtests = pd.DataFrame(output["lab_tests"])
+    # The original read the metadata from `json_data`, a name that no longer
+    # exists after the result was renamed to `output` (NameError).
+    metadata = output["metadata"]
+    # NOTE: the unused `pd.DataFrame(output["reds"])` was dropped; it raised
+    # ValueError whenever the "low" and "high" lists had different lengths.
+    # The flagged tests remain available under output["reds"].
     metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
                    f"**Age:** {metadata['age']}\n\n" \
                    f"**Gender:** {metadata['gender']}\n\n" \
                    f"**Lab Name:** {metadata['lab_name']}\n\n" \
                    f"**Report Date:** {metadata['report_date']}"
+    return metadata_str, labtests, output
 # Define Gradio interface
 with gr.Blocks() as demo:
     submit_btn.click(process_pdf, inputs=[pdf_input], outputs=[metadata_output, lab_test_output])
+# Add API access but only expose JSON
+# NOTE(review): `api` does not appear to be a documented gr.Blocks method, and
+# process_pdf returns a 3-tuple rather than a single JSON value. Confirm this
+# line runs on the installed Gradio version; named endpoints are normally
+# declared with `api_name=` on an event listener.
+demo.api(process_pdf, inputs=[gr.File(type="file")], outputs=gr.JSON(), route="/process")
 # Launch the app
 demo.launch()