Prashasst committed on
Commit
371da07
·
verified ·
1 Parent(s): 970417f

Create entity_recognition.py

Browse files
Files changed (1) hide show
  1. entity_recognition.py +278 -0
entity_recognition.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from config import google_api
3
+
4
+
5
# Gemini model used for entity recognition.
_NER_MODEL_NAME = "gemini-2.0-flash"

# Few-shot example, user turn: task instructions, a sample OCR dump and the
# JSON layout the model is asked to follow. This is runtime prompt text.
_FEW_SHOT_USER_PROMPT = """The following text is extracted from a medical lab report using OCR.
There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
Please correct the errors and extract both metadata and structured lab test data.
ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
Return the output in structured JSON format with all the information in lowercase to standardization.
And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified


Extracted Text:
Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 « % 0-6\\nMONOCYTES : 5 %

Expected JSON format:
{
\"metadata\": {
\"patient_name\": \"Prasahsst Pawar\",
\"age\": \"20\",
\"gender\": \"Male\",
\"lab_name\": \"XYZ Diagnostics\",
\"report_date\": \"05-03-2025\"
},
\"lab_tests\": [
{
\"test_name\": \"hemoglobin\",
\"value\": \"14.2\",
\"unit\": \"g/dL\",
\"reference_range\": \"13.5 - 17.5 g/dL\"
},
{
\"test_name\": \"rbc count\",
\"value\": \"5.2\",
\"unit\": \"million/cu mm\",
\"reference_range\": \"4.1-5.1\"
},
{
\"test_name\": \"glucose\",
\"value\": \"65\",
\"unit\": \"mg/dL\",
\"reference_range\": \"70 - 110 mg/dL\"
}
],
\"reds\":{
\"low\":[\"glucose\"],
\"high\":[\"rbc count\"]
}
}"""

# Few-shot example, model turn: the answer expected for the sample OCR dump
# above. This is runtime prompt text.
_FEW_SHOT_MODEL_REPLY = """{
\"lab_tests\": [
{
\"reference_range\": \"12.0 - 17.0\",
\"test_name\": \"haemoglobin\",
\"unit\": \"gms/dl\",
\"value\": \"14\"
},
{
\"reference_range\": \"4.1-5.1\",
\"test_name\": \"rbc count\",
\"unit\": \"mill/cu mm\",
\"value\": \"4.4\"
},
{
\"reference_range\": \"32.0 - 47.0\",
\"test_name\": \"haematocrit (pcv)\",
\"unit\": \"%\",
\"value\": \"30\"
},
{
\"reference_range\": \"76.0 - 100.0\",
\"test_name\": \"mcv\",
\"unit\": \"fl\",
\"value\": \"78\"
},
{
\"reference_range\": \"26.0-32.0\",
\"test_name\": \"mch\",
\"unit\": \"pg\",
\"value\": \"32.46\"
},
{
\"reference_range\": \"31.5-34.5\",
\"test_name\": \"mchc\",
\"unit\": \"%\",
\"value\": \"32.8\"
},
{
\"reference_range\": \"11.6-15.0\",
\"test_name\": \"rdw\",
\"unit\": \"%\",
\"value\": \"13.9\"
},
{
\"reference_range\": \"6.8- 12.6\",
\"test_name\": \"mpv\",
\"unit\": \"fn\",
\"value\": \"11.2\"
},
{
\"reference_range\": \"4000 - 11000\",
\"test_name\": \"wbc count\",
\"unit\": \"/cu mm\",
\"value\": \"4567\"
},
{
\"reference_range\": \"40-70\",
\"test_name\": \"neutrophils\",
\"unit\": \"%\",
\"value\": \"56\"
},
{
\"reference_range\": \"20.0- 45.0\",
\"test_name\": \"lymphocytes\",
\"unit\": \"%\",
\"value\": \"20\"
},
{
\"reference_range\": \"0-6\",
\"test_name\": \"eosinophils\",
\"unit\": \"%\",
\"value\": \"4\"
},
{
\"reference_range\": \"2-10\",
\"test_name\": \"monocytes\",
\"unit\": \"%\",
\"value\": \"5\"
}
],
\"metadata\": {
\"age\": \"40\",
\"gender\": \"male\",
\"lab_name\": \"sanjeevan hospital\",
\"patient_name\": \"amar shaha\",
\"report_date\": \"09-jul-20\"
},
\"reds\": {
\"high\": [
\"mch\"
],
\"low\": [
\"haematocrit (pcv)\"
]
}
}"""


def _build_contents(types, extracted_text):
    """Build the conversation: one worked example followed by the real OCR text."""
    return [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=_FEW_SHOT_USER_PROMPT)],
        ),
        types.Content(
            role="model",
            parts=[types.Part.from_text(text=_FEW_SHOT_MODEL_REPLY)],
        ),
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=extracted_text)],
        ),
    ]


def _build_response_schema(types):
    """Build the response schema: metadata, lab_tests and reds, all required."""

    def text():
        return types.Schema(type=types.Type.STRING)

    def array_of(items):
        return types.Schema(type=types.Type.ARRAY, items=items)

    def record(properties):
        # Every declared property is required, listed in declaration order.
        return types.Schema(
            type=types.Type.OBJECT,
            enum=[],
            required=list(properties),
            properties=properties,
        )

    metadata_fields = ("patient_name", "age", "gender", "lab_name", "report_date")
    lab_test_fields = ("test_name", "value", "unit", "reference_range")

    return record(
        {
            "metadata": record({name: text() for name in metadata_fields}),
            "lab_tests": array_of(
                record({name: text() for name in lab_test_fields})
            ),
            "reds": record({"low": array_of(text()), "high": array_of(text())}),
        }
    )


def _build_generation_config(types):
    """Build the generation config that requests schema-constrained JSON output."""
    # NOTE(review): temperature=1 is kept from the original; a lower value may
    # suit a pure extraction task better -- confirm before changing.
    return types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_mime_type="application/json",
        response_schema=_build_response_schema(types),
        system_instruction=[
            types.Part.from_text(text="""Always return the output as JSON only"""),
        ],
    )


def _parse_model_json(raw_text):
    """Decode the model's text payload; return None if it is missing or invalid."""
    if raw_text is None:
        # json.loads(None) raises TypeError, not JSONDecodeError, so a response
        # without a text payload has to be caught explicitly.
        print("Error: Empty response from the model.")
        return None
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        print("Error: Invalid JSON response from the model.")
        return None


def process_text(extracted_text):
    """Run lab-test and metadata entity recognition on OCR text with Gemini Flash.

    The OCR text is sent to the model together with one worked example, and
    the model is asked for JSON matching a fixed schema.

    Args:
        extracted_text: Raw OCR text of a lab report.

    Returns:
        The decoded JSON response as a Python object. The requested schema
        has the keys "metadata", "lab_tests" and "reds". Returns None when
        the input is empty or not a string, or when the model's reply is
        missing or is not valid JSON.

    Errors raised by the SDK call itself are not caught here.
    """
    # Nothing to extract: avoid an API call whose required schema fields the
    # model could only fill with invented values.
    if not isinstance(extracted_text, str) or not extracted_text.strip():
        print("Error: No extracted text provided for entity recognition.")
        return None

    # The file's visible import block does not bring `genai` / `types` into
    # scope, so import them here (requires the google-genai package).
    from google import genai
    from google.genai import types

    print("Performing Named Entity Recognition...")

    client = genai.Client(
        api_key=google_api,
    )
    response = client.models.generate_content(
        model=_NER_MODEL_NAME,
        contents=_build_contents(types, extracted_text),
        config=_build_generation_config(types),
    )
    return _parse_model_json(response.text)
278
+