Prashasst committed on
Commit
42e4a8b
·
verified ·
1 Parent(s): 6977359

Update entity_recognition.py

Browse files
Files changed (1) hide show
  1. entity_recognition.py +210 -180
entity_recognition.py CHANGED
@@ -7,166 +7,211 @@ from google.genai import types
7
 
8
 
9
  def process_text(extracted_text):
10
- """Lab Test and metadata entity recognition using gemini flash"""
11
- ''' Return type: JSON '''
12
- print("Performing Named Entity Recognition...")
13
-
14
  client = genai.Client(
15
  api_key=google_api,
16
  )
17
-
18
  model = "gemini-2.0-flash"
19
  contents = [
20
  types.Content(
21
  role="user",
22
  parts=[
23
- types.Part.from_text(text="""The following text is extracted from a medical lab report using OCR.
24
- There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
25
- Please correct the errors and extract both metadata and structured lab test data.
26
- ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
27
- AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
28
- Return the output in structured JSON format with all the information in lowercase to standardization.
29
- And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified
30
 
 
 
 
31
 
32
- Extracted Text:
33
- Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 « % 0-6\\nMONOCYTES : 5 %
 
34
 
35
- Expected JSON format:
 
 
 
 
 
 
36
  {
37
  \"metadata\": {
38
- \"patient_name\": \"Prasahsst Pawar\",
39
- \"age\": \"20\",
40
- \"gender\": \"Male\",
41
- \"lab_name\": \"XYZ Diagnostics\",
42
- \"report_date\": \"05-03-2025\"
43
- },
44
- \"lab_tests\": [
45
- {
46
- \"test_name\": \"hemoglobin\",
47
- \"value\": \"14.2\",
48
- \"unit\": \"g/dL\",
49
- \"reference_range\": \"13.5 - 17.5 g/dL\"
50
- },
51
- {
52
- \"test_name\": \"rbc count\",
53
- \"value\": \"5.2\",
54
- \"unit\": \"million/cu mm\",
55
- \"reference_range\": \"4.1-5.1\"
56
  },
 
57
  {
58
- \"test_name\": \"glucose\",
59
- \"value\": \"65\",
60
- \"unit\": \"mg/dL\",
61
- \"reference_range\": \"70 - 110 mg/dL\"
 
 
 
 
 
 
62
  }
63
- ],
64
- \"reds\":{
65
- \"low\":[\"glucose\"],
66
- \"high\":[\"rbc count\"]
67
  }
68
- }"""),
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  ],
70
  ),
71
  types.Content(
72
  role="model",
73
  parts=[
74
  types.Part.from_text(text="""{
75
- \"lab_tests\": [
76
- {
77
- \"reference_range\": \"12.0 - 17.0\",
78
- \"test_name\": \"haemoglobin\",
79
- \"unit\": \"gms/dl\",
80
- \"value\": \"14\"
81
- },
82
- {
83
- \"reference_range\": \"4.1-5.1\",
84
- \"test_name\": \"rbc count\",
85
- \"unit\": \"mill/cu mm\",
86
- \"value\": \"4.4\"
87
- },
88
- {
89
- \"reference_range\": \"32.0 - 47.0\",
90
- \"test_name\": \"haematocrit (pcv)\",
91
- \"unit\": \"%\",
92
- \"value\": \"30\"
93
- },
94
- {
95
- \"reference_range\": \"76.0 - 100.0\",
96
- \"test_name\": \"mcv\",
97
- \"unit\": \"fl\",
98
- \"value\": \"78\"
99
- },
100
- {
101
- \"reference_range\": \"26.0-32.0\",
102
- \"test_name\": \"mch\",
103
- \"unit\": \"pg\",
104
- \"value\": \"32.46\"
105
- },
106
- {
107
- \"reference_range\": \"31.5-34.5\",
108
- \"test_name\": \"mchc\",
109
- \"unit\": \"%\",
110
- \"value\": \"32.8\"
111
- },
112
- {
113
- \"reference_range\": \"11.6-15.0\",
114
- \"test_name\": \"rdw\",
115
- \"unit\": \"%\",
116
- \"value\": \"13.9\"
117
- },
118
- {
119
- \"reference_range\": \"6.8- 12.6\",
120
- \"test_name\": \"mpv\",
121
- \"unit\": \"fn\",
122
- \"value\": \"11.2\"
123
- },
124
- {
125
- \"reference_range\": \"4000 - 11000\",
126
- \"test_name\": \"wbc count\",
127
- \"unit\": \"/cu mm\",
128
- \"value\": \"4567\"
129
- },
130
- {
131
- \"reference_range\": \"40-70\",
132
- \"test_name\": \"neutrophils\",
133
- \"unit\": \"%\",
134
- \"value\": \"56\"
135
- },
136
- {
137
- \"reference_range\": \"20.0- 45.0\",
138
- \"test_name\": \"lymphocytes\",
139
- \"unit\": \"%\",
140
- \"value\": \"20\"
141
- },
142
- {
143
- \"reference_range\": \"0-6\",
144
- \"test_name\": \"eosinophils\",
145
- \"unit\": \"%\",
146
- \"value\": \"4\"
147
  },
148
- {
149
- \"reference_range\": \"2-10\",
150
- \"test_name\": \"monocytes\",
151
- \"unit\": \"%\",
152
- \"value\": \"5\"
153
- }
154
- ],
155
- \"metadata\": {
156
- \"age\": \"40\",
157
- \"gender\": \"male\",
158
- \"lab_name\": \"sanjeevan hospital\",
159
- \"patient_name\": \"amar shaha\",
160
- \"report_date\": \"09-jul-20\"
161
- },
162
- \"reds\": {
163
- \"high\": [
164
- \"mch\"
165
- ],
166
- \"low\": [
167
- \"haematocrit (pcv)\"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  ]
169
- }
170
  }"""),
171
  ],
172
  ),
@@ -185,12 +230,10 @@ Expected JSON format:
185
  response_mime_type="application/json",
186
  response_schema=genai.types.Schema(
187
  type = genai.types.Type.OBJECT,
188
- enum = [],
189
- required = ["metadata", "lab_tests", "reds"],
190
  properties = {
191
  "metadata": genai.types.Schema(
192
  type = genai.types.Type.OBJECT,
193
- enum = [],
194
  required = ["patient_name", "age", "gender", "lab_name", "report_date"],
195
  properties = {
196
  "patient_name": genai.types.Schema(
@@ -201,6 +244,7 @@ Expected JSON format:
201
  ),
202
  "gender": genai.types.Schema(
203
  type = genai.types.Type.STRING,
 
204
  ),
205
  "lab_name": genai.types.Schema(
206
  type = genai.types.Type.STRING,
@@ -210,63 +254,49 @@ Expected JSON format:
210
  ),
211
  },
212
  ),
213
- "lab_tests": genai.types.Schema(
214
  type = genai.types.Type.ARRAY,
215
  items = genai.types.Schema(
216
  type = genai.types.Type.OBJECT,
217
- enum = [],
218
- required = ["test_name", "value", "unit", "reference_range"],
219
  properties = {
220
- "test_name": genai.types.Schema(
221
- type = genai.types.Type.STRING,
222
- ),
223
- "value": genai.types.Schema(
224
  type = genai.types.Type.STRING,
225
  ),
226
- "unit": genai.types.Schema(
227
- type = genai.types.Type.STRING,
228
- ),
229
- "reference_range": genai.types.Schema(
230
- type = genai.types.Type.STRING,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  ),
232
  },
233
  ),
234
  ),
235
- "reds": genai.types.Schema(
236
- type = genai.types.Type.OBJECT,
237
- enum = [],
238
- required = ["low", "high"],
239
- properties = {
240
- "low": genai.types.Schema(
241
- type = genai.types.Type.ARRAY,
242
- items = genai.types.Schema(
243
- type = genai.types.Type.STRING,
244
- ),
245
- ),
246
- "high": genai.types.Schema(
247
- type = genai.types.Type.ARRAY,
248
- items = genai.types.Schema(
249
- type = genai.types.Type.STRING,
250
- ),
251
- ),
252
- },
253
- ),
254
  },
255
  ),
256
  system_instruction=[
257
- types.Part.from_text(text="""Always return the output as JSON only"""),
258
  ],
259
  )
260
-
261
-
262
-
263
- # for chunk in client.models.generate_content_stream(
264
- # model=model,
265
- # contents=contents,
266
- # config=generate_content_config,
267
- # ):
268
- # print(chunk.text, end="")
269
-
270
  try:
271
  response = client.models.generate_content(
272
  model=model, contents=contents, config=generate_content_config
 
7
 
8
 
9
  def process_text(extracted_text):
 
 
 
 
10
  client = genai.Client(
11
  api_key=google_api,
12
  )
 
13
  model = "gemini-2.0-flash"
14
  contents = [
15
  types.Content(
16
  role="user",
17
  parts=[
18
+ types.Part.from_text(text="""**Instruction:**
19
+ You are an advanced AI model specializing in medical data extraction. Given an unstructured OCR-extracted text from a medical lab report, your task is to:
 
 
 
 
 
20
 
21
+ 1. **Correct Errors**
22
+ - Fix missing decimals, incorrect test names, and incorrect reference ranges.
23
+ - Ensure test values fall within valid medical reference ranges.
24
 
25
+ 2. **Extract and Structure Data**
26
+ - Extract **metadata** (patient details) and **lab report data** in structured JSON format.
27
+ - Maintain consistency in naming conventions and JSON structure.
28
 
29
+ 3. **Assign Status Labels**
30
+ - **GREEN**: Value is within the normal range.
31
+ - **AMBER**: Borderline or slightly out of range.
32
+ - **RED**: Critical or significantly out of range.
33
+
34
+ ### **JSON Output Format (Strictly Follow This Structure)**
35
+ ```json
36
  {
37
  \"metadata\": {
38
+ \"patient_name\": \"<Corrected Name>\",
39
+ \"age\": \"<Age>\",
40
+ \"gender\": \"<Male/Female>\",
41
+ \"lab_name\": \"<Lab Name>\",
42
+ \"report_date\": \"<DD-MM-YYYY>\"
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  },
44
+ \"report\": [
45
  {
46
+ \"test_type\": \"<HEMOGRAM / BIOCHEMISTRY / OTHER>\",
47
+ \"lab_tests\": [
48
+ {
49
+ \"test_name\": \"<Corrected Test Name>\",
50
+ \"value\": \"<Numerical Value>\",
51
+ \"unit\": \"<Unit>\",
52
+ \"reference_range\": \"<Lower Limit - Upper Limit Unit>\",
53
+ \"status\": \"<GREEN / AMBER / RED>\"
54
+ }
55
+ ]
56
  }
57
+ ]
 
 
 
58
  }
59
+ ```
60
+ ###EXTRACTED TEXT :
61
+ Dr. Onkar Test Sanjeevan Iospital MNES Mn) No:Tiz 12/4 Paud Racid Kothrud Fune - 4V102 Ph: 02025262+5,8983390126, Tlmins: 09.15 AM 0z.30 PMOS.30PM OY_OPAAPPOINTMENTS ONLY | Closed: Mondjy Fridwy Ftent UID: 67 Report No: UOOI8 Nane: AMAF SHAHA (Mle) DIc 02-lul-20 73e 40 years Sample CollectedAc HoqitLb Mddress; MG Rozd FUNE Simple Type/Quantly: Blood Ref. By Doctor Sumnple Collexulon DT: 2-Jul-20, 950AV Dc . Amlt Dcshmukh Tesl Fesult DT: 0)-Jul-zo. +53PI HEMOGRAM IvesTGATiUR RESULT UNM REF. MINGE HaLMOGiOMN 14 guts/a 120.170 KRC coint 4 nlfcunm 41 51 HAEMOTOCRIT (PcW) 30 9u 320.470 MCV 78 n 760.i00, Mch H 32 6 200.320 McHc 32 A 315.365 Rdw 13 9 ; 116.I50 MPv 112 6A . 126 WKC CNT 4567 IcuMIM ACOO Tiuoo DIFTERENTML CnT MeuTKOHILS 56 10 - 70 LyMPOCTTES 20 200 450 NONcytis 1 4.6 RASOPHILS 0.1 PETELETS 273+00 fcu mm I50oou -+Ouuou PEIIPHERIL SMEAIL EXAMINATION RBC Mopholegy Nonmocyulc Normaxchronulc PMTELETS Adequate RS For "'Jnsites Not Scen TEST DONE ON FULLY AUTOMATED) FWVE PART HAEMOTOLOGY ANALYSER XTIROO , SYSMEXHPENTRA XLR HORRIRIBA (DfierentaalWBC Cunt [s correlted microscoplelly) H High, L Loiv ) END OF REPORT 8 Rv Rc
62
+
63
+
64
+
65
+
66
+ ### **Key Constraints:**
67
+ - **DO NOT** include extra metadata fields beyond `patient_name`, `age`, `gender`, `lab_name`, and `report_date`.
68
+ - **DO NOT** include extra test fields beyond `test_name`, `value`, `unit`, `reference_range`, and `status`.
69
+ - **ENSURE** numerical values align with real-world medical reference ranges.
70
+ - **FORMAT OUTPUT** strictly in JSON, ensuring proper nesting.
71
+
72
+ """),
73
  ],
74
  ),
75
  types.Content(
76
  role="model",
77
  parts=[
78
  types.Part.from_text(text="""{
79
+ \"metadata\": {
80
+ \"patient_name\": \"Amar Shaha\",
81
+ \"age\": \"40 years\",
82
+ \"gender\": \"Male\",
83
+ \"lab_name\": \"Sanjeevan Hospital\",
84
+ \"report_date\": \"02-07-2023\"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  },
86
+ \"report\": [
87
+ {
88
+ \"lab_tests\": [
89
+ {
90
+ \"reference_range\": \"12 - 17 g/dL\",
91
+ \"status\": \"GREEN\",
92
+ \"test_name\": \"Hemoglobin\",
93
+ \"unit\": \"g/dL\",
94
+ \"value\": \"14.1\"
95
+ },
96
+ {
97
+ \"reference_range\": \"4 - 6 million/cu.mm\",
98
+ \"status\": \"GREEN\",
99
+ \"test_name\": \"RBC Count\",
100
+ \"unit\": \"million/cu.mm\",
101
+ \"value\": \"4.8\"
102
+ },
103
+ {
104
+ \"reference_range\": \"37 - 52 %\",
105
+ \"status\": \"AMBER\",
106
+ \"test_name\": \"Hematocrit (PCV)\",
107
+ \"unit\": \"%\",
108
+ \"value\": \"36.9\"
109
+ },
110
+ {
111
+ \"reference_range\": \"76 - 100 fL\",
112
+ \"status\": \"GREEN\",
113
+ \"test_name\": \"MCV\",
114
+ \"unit\": \"fL\",
115
+ \"value\": \"78.1\"
116
+ },
117
+ {
118
+ \"reference_range\": \"27 - 32 pg\",
119
+ \"status\": \"RED\",
120
+ \"test_name\": \"MCH\",
121
+ \"unit\": \"pg\",
122
+ \"value\": \"32.6\"
123
+ },
124
+ {
125
+ \"reference_range\": \"31.5 - 36.5 g/dL\",
126
+ \"status\": \"GREEN\",
127
+ \"test_name\": \"MCHC\",
128
+ \"unit\": \"g/dL\",
129
+ \"value\": \"32.8\"
130
+ },
131
+ {
132
+ \"reference_range\": \"11.6 - 15\",
133
+ \"status\": \"GREEN\",
134
+ \"test_name\": \"RDW\",
135
+ \"unit\": \"%\",
136
+ \"value\": \"13.9\"
137
+ },
138
+ {
139
+ \"reference_range\": \"7.2 - 12.9 fL\",
140
+ \"status\": \"AMBER\",
141
+ \"test_name\": \"MPV\",
142
+ \"unit\": \"fL\",
143
+ \"value\": \"11.2\"
144
+ },
145
+ {
146
+ \"reference_range\": \"4000 - 11000 /cu.mm\",
147
+ \"status\": \"GREEN\",
148
+ \"test_name\": \"WBC Count\",
149
+ \"unit\": \"/cu.mm\",
150
+ \"value\": \"4567\"
151
+ },
152
+ {
153
+ \"reference_range\": \"150000 - 450000 /cu.mm\",
154
+ \"status\": \"GREEN\",
155
+ \"test_name\": \"Platelets\",
156
+ \"unit\": \"/cu.mm\",
157
+ \"value\": \"273000\"
158
+ }
159
+ ],
160
+ \"test_type\": \"HEMOGRAM\"
161
+ },
162
+ {
163
+ \"lab_tests\": [
164
+ {
165
+ \"reference_range\": \"40 - 70 %\",
166
+ \"status\": \"GREEN\",
167
+ \"test_name\": \"Neutrophils\",
168
+ \"unit\": \"%\",
169
+ \"value\": \"56.1\"
170
+ },
171
+ {
172
+ \"reference_range\": \"20 - 45 %\",
173
+ \"status\": \"GREEN\",
174
+ \"test_name\": \"Lymphocytes\",
175
+ \"unit\": \"%\",
176
+ \"value\": \"20.2\"
177
+ },
178
+ {
179
+ \"reference_range\": \"2 - 10 %\",
180
+ \"status\": \"RED\",
181
+ \"test_name\": \"Monocytes\",
182
+ \"unit\": \"%\",
183
+ \"value\": \"14.6\"
184
+ },
185
+ {
186
+ \"reference_range\": \"0 - 1 %\",
187
+ \"status\": \"GREEN\",
188
+ \"test_name\": \"Basophils\",
189
+ \"unit\": \"%\",
190
+ \"value\": \"0.1\"
191
+ }
192
+ ],
193
+ \"test_type\": \"DIFFERENTIAL COUNT\"
194
+ },
195
+ {
196
+ \"lab_tests\": [
197
+ {
198
+ \"reference_range\": \"Normal\",
199
+ \"status\": \"GREEN\",
200
+ \"test_name\": \"RBC Morphology\",
201
+ \"unit\": \"N/A\",
202
+ \"value\": \"Normocytic Normochromic\"
203
+ },
204
+ {
205
+ \"reference_range\": \"Normal\",
206
+ \"status\": \"GREEN\",
207
+ \"test_name\": \"Platelet Morphology\",
208
+ \"unit\": \"N/A\",
209
+ \"value\": \"Adequate\"
210
+ }
211
+ ],
212
+ \"test_type\": \"PERIPHERAL SMEAR EXAMINATION\"
213
+ }
214
  ]
 
215
  }"""),
216
  ],
217
  ),
 
230
  response_mime_type="application/json",
231
  response_schema=genai.types.Schema(
232
  type = genai.types.Type.OBJECT,
233
+ required = ["metadata", "report"],
 
234
  properties = {
235
  "metadata": genai.types.Schema(
236
  type = genai.types.Type.OBJECT,
 
237
  required = ["patient_name", "age", "gender", "lab_name", "report_date"],
238
  properties = {
239
  "patient_name": genai.types.Schema(
 
244
  ),
245
  "gender": genai.types.Schema(
246
  type = genai.types.Type.STRING,
247
+ enum = ["Male", "Female", "Other"],
248
  ),
249
  "lab_name": genai.types.Schema(
250
  type = genai.types.Type.STRING,
 
254
  ),
255
  },
256
  ),
257
+ "report": genai.types.Schema(
258
  type = genai.types.Type.ARRAY,
259
  items = genai.types.Schema(
260
  type = genai.types.Type.OBJECT,
261
+ required = ["test_type", "lab_tests"],
 
262
  properties = {
263
+ "test_type": genai.types.Schema(
 
 
 
264
  type = genai.types.Type.STRING,
265
  ),
266
+ "lab_tests": genai.types.Schema(
267
+ type = genai.types.Type.ARRAY,
268
+ items = genai.types.Schema(
269
+ type = genai.types.Type.OBJECT,
270
+ required = ["test_name", "value", "unit", "reference_range", "status"],
271
+ properties = {
272
+ "test_name": genai.types.Schema(
273
+ type = genai.types.Type.STRING,
274
+ ),
275
+ "value": genai.types.Schema(
276
+ type = genai.types.Type.STRING,
277
+ ),
278
+ "unit": genai.types.Schema(
279
+ type = genai.types.Type.STRING,
280
+ ),
281
+ "reference_range": genai.types.Schema(
282
+ type = genai.types.Type.STRING,
283
+ ),
284
+ "status": genai.types.Schema(
285
+ type = genai.types.Type.STRING,
286
+ enum = ["GREEN", "AMBER", "RED"],
287
+ ),
288
+ },
289
+ ),
290
  ),
291
  },
292
  ),
293
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  },
295
  ),
296
  system_instruction=[
297
+ types.Part.from_text(text="""You are an advanced medical data extraction AI designed to process unstructured OCR text from medical lab reports. Your task is to correct errors in test names, values, and reference ranges while ensuring all values align with real-world medical standards. Extract metadata and lab test data in a structured JSON format, strictly following the predefined schema. Assign status labels (GREEN, AMBER, RED) based on whether test values fall within, near, or outside the reference range. Do not add extra fields or modify reference ranges unless corrections are needed for accuracy. Ensure consistent formatting, valid numerical values, and a properly structured JSON output without any deviations."""),
298
  ],
299
  )
 
 
 
 
 
 
 
 
 
 
300
  try:
301
  response = client.models.generate_content(
302
  model=model, contents=contents, config=generate_content_config