Spaces:

Prashasst
/

Medical_Lab_Test_Extraction_Pipeline

Sleeping

App Files Community

Prashasst committed on Mar 17

Commit

42e4a8b

verified ·

1 Parent(s): 6977359

Update entity_recognition.py

Browse files

Files changed (1) hide show

entity_recognition.py +210 -180

entity_recognition.py CHANGED Viewed

@@ -7,166 +7,211 @@ from google.genai import types
 def process_text(extracted_text):
-    """Lab Test and metadata entity recognition using gemini flash"""
-    ''' Return type: JSON '''
-    print("Performing Named Entity Recognition...")
     client = genai.Client(
         api_key=google_api,
     )
     model = "gemini-2.0-flash"
     contents = [
         types.Content(
             role="user",
             parts=[
-                types.Part.from_text(text="""The following text is extracted from a medical lab report using OCR.
-There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
-Please correct the errors and extract both metadata and structured lab test data.
-ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
-AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
-Return the output in structured JSON format with all the information in lowercase to standardization.
-And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified
-Extracted Text:
-Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n     \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 « % 0-6\\nMONOCYTES : 5 %
-Expected JSON format:
 {
     \"metadata\": {
-        \"patient_name\": \"Prasahsst Pawar\",
-        \"age\": \"20\",
-        \"gender\": \"Male\",
-        \"lab_name\": \"XYZ Diagnostics\",
-        \"report_date\": \"05-03-2025\"
-    },
-    \"lab_tests\": [
-        {
-            \"test_name\": \"hemoglobin\",
-            \"value\": \"14.2\",
-            \"unit\": \"g/dL\",
-            \"reference_range\": \"13.5 - 17.5 g/dL\"
-        },
-        {
-      \"test_name\": \"rbc count\",
-      \"value\": \"5.2\",
-      \"unit\": \"million/cu mm\",
-      \"reference_range\": \"4.1-5.1\"
     },
         {
-            \"test_name\": \"glucose\",
-            \"value\": \"65\",
-            \"unit\": \"mg/dL\",
-            \"reference_range\": \"70 - 110 mg/dL\"
         }
-    ],
-\"reds\":{
-    \"low\":[\"glucose\"],
-    \"high\":[\"rbc count\"]
 }
-}"""),
             ],
         ),
         types.Content(
             role="model",
             parts=[
                 types.Part.from_text(text="""{
-  \"lab_tests\": [
-    {
-      \"reference_range\": \"12.0 - 17.0\",
-      \"test_name\": \"haemoglobin\",
-      \"unit\": \"gms/dl\",
-      \"value\": \"14\"
-    },
-    {
-      \"reference_range\": \"4.1-5.1\",
-      \"test_name\": \"rbc count\",
-      \"unit\": \"mill/cu mm\",
-      \"value\": \"4.4\"
-    },
-    {
-      \"reference_range\": \"32.0 - 47.0\",
-      \"test_name\": \"haematocrit (pcv)\",
-      \"unit\": \"%\",
-      \"value\": \"30\"
-    },
-    {
-      \"reference_range\": \"76.0 - 100.0\",
-      \"test_name\": \"mcv\",
-      \"unit\": \"fl\",
-      \"value\": \"78\"
-    },
-    {
-      \"reference_range\": \"26.0-32.0\",
-      \"test_name\": \"mch\",
-      \"unit\": \"pg\",
-      \"value\": \"32.46\"
-    },
-    {
-      \"reference_range\": \"31.5-34.5\",
-      \"test_name\": \"mchc\",
-      \"unit\": \"%\",
-      \"value\": \"32.8\"
-    },
-    {
-      \"reference_range\": \"11.6-15.0\",
-      \"test_name\": \"rdw\",
-      \"unit\": \"%\",
-      \"value\": \"13.9\"
-    },
-    {
-      \"reference_range\": \"6.8- 12.6\",
-      \"test_name\": \"mpv\",
-      \"unit\": \"fn\",
-      \"value\": \"11.2\"
-    },
-    {
-      \"reference_range\": \"4000 - 11000\",
-      \"test_name\": \"wbc count\",
-      \"unit\": \"/cu mm\",
-      \"value\": \"4567\"
-    },
-    {
-      \"reference_range\": \"40-70\",
-      \"test_name\": \"neutrophils\",
-      \"unit\": \"%\",
-      \"value\": \"56\"
-    },
-    {
-      \"reference_range\": \"20.0- 45.0\",
-      \"test_name\": \"lymphocytes\",
-      \"unit\": \"%\",
-      \"value\": \"20\"
-    },
-    {
-      \"reference_range\": \"0-6\",
-      \"test_name\": \"eosinophils\",
-      \"unit\": \"%\",
-      \"value\": \"4\"
     },
-    {
-      \"reference_range\": \"2-10\",
-      \"test_name\": \"monocytes\",
-      \"unit\": \"%\",
-      \"value\": \"5\"
-    }
-  ],
-  \"metadata\": {
-    \"age\": \"40\",
-    \"gender\": \"male\",
-    \"lab_name\": \"sanjeevan hospital\",
-    \"patient_name\": \"amar shaha\",
-    \"report_date\": \"09-jul-20\"
-  },
-  \"reds\": {
-    \"high\": [
-      \"mch\"
-    ],
-    \"low\": [
-      \"haematocrit (pcv)\"
     ]
-  }
 }"""),
             ],
         ),
@@ -185,12 +230,10 @@ Expected JSON format:
         response_mime_type="application/json",
         response_schema=genai.types.Schema(
             type = genai.types.Type.OBJECT,
-            enum = [],
-            required = ["metadata", "lab_tests", "reds"],
             properties = {
                 "metadata": genai.types.Schema(
                     type = genai.types.Type.OBJECT,
-                    enum = [],
                     required = ["patient_name", "age", "gender", "lab_name", "report_date"],
                     properties = {
                         "patient_name": genai.types.Schema(
@@ -201,6 +244,7 @@ Expected JSON format:
                         ),
                         "gender": genai.types.Schema(
                             type = genai.types.Type.STRING,
                         ),
                         "lab_name": genai.types.Schema(
                             type = genai.types.Type.STRING,
@@ -210,63 +254,49 @@ Expected JSON format:
                         ),
                     },
                 ),
-                "lab_tests": genai.types.Schema(
                     type = genai.types.Type.ARRAY,
                     items = genai.types.Schema(
                         type = genai.types.Type.OBJECT,
-                        enum = [],
-                        required = ["test_name", "value", "unit", "reference_range"],
                         properties = {
-                            "test_name": genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                            "value": genai.types.Schema(
                                 type = genai.types.Type.STRING,
                             ),
-                            "unit": genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                            "reference_range": genai.types.Schema(
-                                type = genai.types.Type.STRING,
                             ),
                         },
                     ),
                 ),
-                "reds": genai.types.Schema(
-                    type = genai.types.Type.OBJECT,
-                    enum = [],
-                    required = ["low", "high"],
-                    properties = {
-                        "low": genai.types.Schema(
-                            type = genai.types.Type.ARRAY,
-                            items = genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                        ),
-                        "high": genai.types.Schema(
-                            type = genai.types.Type.ARRAY,
-                            items = genai.types.Schema(
-                                type = genai.types.Type.STRING,
-                            ),
-                        ),
-                    },
-                ),
             },
         ),
         system_instruction=[
-            types.Part.from_text(text="""Always return the output as JSON only"""),
         ],
     )
-    # for chunk in client.models.generate_content_stream(
-    #     model=model,
-    #     contents=contents,
-    #     config=generate_content_config,
-    # ):
-    #     print(chunk.text, end="")
     try:
         response = client.models.generate_content(
             model=model, contents=contents, config=generate_content_config

 def process_text(extracted_text):
     client = genai.Client(
         api_key=google_api,
     )
     model = "gemini-2.0-flash"
     contents = [
         types.Content(
             role="user",
             parts=[
+                types.Part.from_text(text="""**Instruction:**
+You are an advanced AI model specializing in medical data extraction. Given an unstructured OCR-extracted text from a medical lab report, your task is to:
+1. **Correct Errors**
+   - Fix missing decimals, incorrect test names, and incorrect reference ranges.
+   - Ensure test values fall within valid medical reference ranges.
+2. **Extract and Structure Data**
+   - Extract **metadata** (patient details) and **lab report data** in structured JSON format.
+   - Maintain consistency in naming conventions and JSON structure.
+3. **Assign Status Labels**
+   - **GREEN**: Value is within the normal range.
+   - **AMBER**: Borderline or slightly out of range.
+   - **RED**: Critical or significantly out of range.
+### **JSON Output Format (Strictly Follow This Structure)**
+```json
 {
     \"metadata\": {
+        \"patient_name\": \"<Corrected Name>\",
+        \"age\": \"<Age>\",
+        \"gender\": \"<Male/Female>\",
+        \"lab_name\": \"<Lab Name>\",
+        \"report_date\": \"<DD-MM-YYYY>\"
     },
+    \"report\": [
         {
+            \"test_type\": \"<HEMOGRAM / BIOCHEMISTRY / OTHER>\",
+            \"lab_tests\": [
+                {
+                    \"test_name\": \"<Corrected Test Name>\",
+                    \"value\": \"<Numerical Value>\",
+                    \"unit\": \"<Unit>\",
+                    \"reference_range\": \"<Lower Limit - Upper Limit Unit>\",
+                    \"status\": \"<GREEN / AMBER / RED>\"
+                }
+            ]
         }
+    ]
 }
+```
+###EXTRACTED TEXT :
+Dr. Onkar Test Sanjeevan Iospital MNES Mn) No:Tiz 12/4 Paud Racid  Kothrud Fune - 4V102 Ph: 02025262+5,8983390126, Tlmins: 09.15 AM 0z.30 PMOS.30PM OY_OPAAPPOINTMENTS ONLY | Closed: Mondjy Fridwy Ftent UID: 67 Report No: UOOI8 Nane: AMAF SHAHA (Mle) DIc 02-lul-20 73e 40 years Sample CollectedAc HoqitLb Mddress; MG Rozd FUNE Simple Type/Quantly: Blood Ref. By Doctor Sumnple Collexulon DT: 2-Jul-20, 950AV Dc . Amlt Dcshmukh Tesl Fesult DT: 0)-Jul-zo. +53PI HEMOGRAM IvesTGATiUR RESULT UNM REF. MINGE HaLMOGiOMN 14 guts/a 120.170 KRC coint 4 nlfcunm 41 51 HAEMOTOCRIT (PcW) 30 9u 320.470 MCV 78 n 760.i00, Mch H 32 6 200.320 McHc 32 A 315.365 Rdw 13 9 ; 116.I50 MPv 112 6A . 126 WKC CNT 4567 IcuMIM ACOO Tiuoo DIFTERENTML CnT MeuTKOHILS 56 10 - 70 LyMPOCTTES 20 200 450 NONcytis 1 4.6 RASOPHILS 0.1 PETELETS 273+00 fcu mm I50oou -+Ouuou PEIIPHERIL SMEAIL EXAMINATION RBC Mopholegy Nonmocyulc Normaxchronulc PMTELETS Adequate RS For "'Jnsites Not Scen TEST DONE ON FULLY AUTOMATED) FWVE PART HAEMOTOLOGY ANALYSER XTIROO , SYSMEXHPENTRA XLR HORRIRIBA (DfierentaalWBC Cunt [s correlted microscoplelly) H High, L Loiv ) END OF REPORT 8 Rv Rc
+### **Key Constraints:**
+- **DO NOT** include extra metadata fields beyond `patient_name`, `age`, `gender`, `lab_name`, and `report_date`.
+- **DO NOT** include extra test fields beyond `test_name`, `value`, `unit`, `reference_range`, and `status`.
+- **ENSURE** numerical values align with real-world medical reference ranges.
+- **FORMAT OUTPUT** strictly in JSON, ensuring proper nesting.
+"""),
             ],
         ),
         types.Content(
             role="model",
             parts=[
                 types.Part.from_text(text="""{
+    \"metadata\": {
+        \"patient_name\": \"Amar Shaha\",
+        \"age\": \"40 years\",
+        \"gender\": \"Male\",
+        \"lab_name\": \"Sanjeevan Hospital\",
+        \"report_date\": \"02-07-2023\"
     },
+    \"report\": [
+        {
+            \"lab_tests\": [
+                {
+                    \"reference_range\": \"12 - 17 g/dL\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"Hemoglobin\",
+                    \"unit\": \"g/dL\",
+                    \"value\": \"14.1\"
+                },
+                {
+                    \"reference_range\": \"4 - 6 million/cu.mm\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"RBC Count\",
+                    \"unit\": \"million/cu.mm\",
+                    \"value\": \"4.8\"
+                },
+                {
+                    \"reference_range\": \"37 - 52 %\",
+                    \"status\": \"AMBER\",
+                    \"test_name\": \"Hematocrit (PCV)\",
+                    \"unit\": \"%\",
+                    \"value\": \"36.9\"
+                },
+                {
+                    \"reference_range\": \"76 - 100 fL\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"MCV\",
+                    \"unit\": \"fL\",
+                    \"value\": \"78.1\"
+                },
+                {
+                    \"reference_range\": \"27 - 32 pg\",
+                    \"status\": \"RED\",
+                    \"test_name\": \"MCH\",
+                    \"unit\": \"pg\",
+                    \"value\": \"32.6\"
+                },
+                {
+                    \"reference_range\": \"31.5 - 36.5 g/dL\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"MCHC\",
+                    \"unit\": \"g/dL\",
+                    \"value\": \"32.8\"
+                },
+                {
+                    \"reference_range\": \"11.6 - 15\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"RDW\",
+                    \"unit\": \"%\",
+                    \"value\": \"13.9\"
+                },
+                {
+                    \"reference_range\": \"7.2 - 12.9 fL\",
+                    \"status\": \"AMBER\",
+                    \"test_name\": \"MPV\",
+                    \"unit\": \"fL\",
+                    \"value\": \"11.2\"
+                },
+                {
+                    \"reference_range\": \"4000 - 11000 /cu.mm\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"WBC Count\",
+                    \"unit\": \"/cu.mm\",
+                    \"value\": \"4567\"
+                },
+                {
+                    \"reference_range\": \"150000 - 450000 /cu.mm\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"Platelets\",
+                    \"unit\": \"/cu.mm\",
+                    \"value\": \"273000\"
+                }
+            ],
+            \"test_type\": \"HEMOGRAM\"
+        },
+        {
+            \"lab_tests\": [
+                {
+                    \"reference_range\": \"40 - 70 %\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"Neutrophils\",
+                    \"unit\": \"%\",
+                    \"value\": \"56.1\"
+                },
+                {
+                    \"reference_range\": \"20 - 45 %\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"Lymphocytes\",
+                    \"unit\": \"%\",
+                    \"value\": \"20.2\"
+                },
+                {
+                    \"reference_range\": \"2 - 10 %\",
+                    \"status\": \"RED\",
+                    \"test_name\": \"Monocytes\",
+                    \"unit\": \"%\",
+                    \"value\": \"14.6\"
+                },
+                {
+                    \"reference_range\": \"0 - 1 %\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"Basophils\",
+                    \"unit\": \"%\",
+                    \"value\": \"0.1\"
+                }
+            ],
+            \"test_type\": \"DIFFERENTIAL COUNT\"
+        },
+        {
+            \"lab_tests\": [
+                {
+                    \"reference_range\": \"Normal\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"RBC Morphology\",
+                    \"unit\": \"N/A\",
+                    \"value\": \"Normocytic Normochromic\"
+                },
+                {
+                    \"reference_range\": \"Normal\",
+                    \"status\": \"GREEN\",
+                    \"test_name\": \"Platelet Morphology\",
+                    \"unit\": \"N/A\",
+                    \"value\": \"Adequate\"
+                }
+            ],
+            \"test_type\": \"PERIPHERAL SMEAR EXAMINATION\"
+        }
     ]
 }"""),
             ],
         ),
         response_mime_type="application/json",
         response_schema=genai.types.Schema(
             type = genai.types.Type.OBJECT,
+            required = ["metadata", "report"],
             properties = {
                 "metadata": genai.types.Schema(
                     type = genai.types.Type.OBJECT,
                     required = ["patient_name", "age", "gender", "lab_name", "report_date"],
                     properties = {
                         "patient_name": genai.types.Schema(
                         ),
                         "gender": genai.types.Schema(
                             type = genai.types.Type.STRING,
+                            enum = ["Male", "Female", "Other"],
                         ),
                         "lab_name": genai.types.Schema(
                             type = genai.types.Type.STRING,
                         ),
                     },
                 ),
+                "report": genai.types.Schema(
                     type = genai.types.Type.ARRAY,
                     items = genai.types.Schema(
                         type = genai.types.Type.OBJECT,
+                        required = ["test_type", "lab_tests"],
                         properties = {
+                            "test_type": genai.types.Schema(
                                 type = genai.types.Type.STRING,
                             ),
+                            "lab_tests": genai.types.Schema(
+                                type = genai.types.Type.ARRAY,
+                                items = genai.types.Schema(
+                                    type = genai.types.Type.OBJECT,
+                                    required = ["test_name", "value", "unit", "reference_range", "status"],
+                                    properties = {
+                                        "test_name": genai.types.Schema(
+                                            type = genai.types.Type.STRING,
+                                        ),
+                                        "value": genai.types.Schema(
+                                            type = genai.types.Type.STRING,
+                                        ),
+                                        "unit": genai.types.Schema(
+                                            type = genai.types.Type.STRING,
+                                        ),
+                                        "reference_range": genai.types.Schema(
+                                            type = genai.types.Type.STRING,
+                                        ),
+                                        "status": genai.types.Schema(
+                                            type = genai.types.Type.STRING,
+                                            enum = ["GREEN", "AMBER", "RED"],
+                                        ),
+                                    },
+                                ),
                             ),
                         },
                     ),
                 ),
             },
         ),
         system_instruction=[
+            types.Part.from_text(text="""You are an advanced medical data extraction AI designed to process unstructured OCR text from medical lab reports. Your task is to correct errors in test names, values, and reference ranges while ensuring all values align with real-world medical standards. Extract metadata and lab test data in a structured JSON format, strictly following the predefined schema. Assign status labels (GREEN, AMBER, RED) based on whether test values fall within, near, or outside the reference range. Do not add extra fields or modify reference ranges unless corrections are needed for accuracy. Ensure consistent formatting, valid numerical values, and a properly structured JSON output without any deviations."""),
         ],
     )
     try:
         response = client.models.generate_content(
             model=model, contents=contents, config=generate_content_config