Prashasst committed on
Commit
4f7c634
·
verified ·
1 Parent(s): ef0811b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -365
app.py CHANGED
@@ -1,357 +1,21 @@
1
  import gradio as gr
2
  import pandas as pd
3
- import fitz # PyMuPDF
4
- import pytesseract
5
- from pdf2image import convert_from_path
6
- import os
7
- import base64
8
- from google import genai
9
- from google.genai import types
10
-
11
- google_api=os.getenv("google_api")
12
-
13
-
14
-
15
-
16
- def read_pdf(pdf_path):
17
- text = ""
18
- doc = fitz.open(pdf_path)
19
-
20
- for page_num in range(len(doc)):
21
- page = doc.load_page(page_num)
22
- page_text = page.get_text("text").strip() # Extract text from page
23
-
24
- # Extract Images for OCR
25
- images = page.get_images(full=True) # Check if the page has images
26
-
27
- ocr_text = ""
28
- if images: # If images exist, process them
29
- print(f"Page {page_num + 1} contains images, performing OCR...")
30
- img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
31
-
32
- for img in img_pages:
33
- ocr_text += pytesseract.image_to_string(img).strip() + "\n"
34
-
35
- # Combine both text extraction methods
36
- combined_text = f"{page_text}\n{ocr_text}".strip()
37
-
38
- if combined_text:
39
- text += combined_text + "\n\n"
40
-
41
- return text.strip()
42
-
43
-
44
-
45
-
46
-
47
-
48
- def generate(extracted_text):
49
- client = genai.Client(
50
- api_key=google_api,
51
- )
52
-
53
- model = "gemini-2.0-flash"
54
- contents = [
55
- types.Content(
56
- role="user",
57
- parts=[
58
- types.Part.from_text(text="""The following text is extracted from a medical lab report using OCR.
59
- There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
60
- Please correct the errors and extract both metadata and structured lab test data.
61
- ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
62
- AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
63
- Return the output in structured JSON format with all the information in lowercase to standardization.
64
- And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified
65
-
66
-
67
- Extracted Text:
68
- Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |Β» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 Β« % 0-6\\nMONOCYTES : 5 %
69
-
70
- Expected JSON format:
71
- {
72
- \"metadata\": {
73
- \"patient_name\": \"Prasahsst Pawar\",
74
- \"age\": \"20\",
75
- \"gender\": \"Male\",
76
- \"lab_name\": \"XYZ Diagnostics\",
77
- \"report_date\": \"05-03-2025\"
78
- },
79
- \"lab_tests\": [
80
- {
81
- \"test_name\": \"hemoglobin\",
82
- \"value\": \"14.2\",
83
- \"unit\": \"g/dL\",
84
- \"reference_range\": \"13.5 - 17.5 g/dL\"
85
- },
86
- {
87
- \"test_name\": \"rbc count\",
88
- \"value\": \"5.2\",
89
- \"unit\": \"million/cu mm\",
90
- \"reference_range\": \"4.1-5.1\"
91
- },
92
- {
93
- \"test_name\": \"glucose\",
94
- \"value\": \"65\",
95
- \"unit\": \"mg/dL\",
96
- \"reference_range\": \"70 - 110 mg/dL\"
97
- }
98
- ],
99
- \"reds\":{
100
- \"low\":[\"glucose\"],
101
- \"high\":[\"rbc count\"]
102
- }
103
- }"""),
104
- ],
105
- ),
106
- types.Content(
107
- role="model",
108
- parts=[
109
- types.Part.from_text(text="""{
110
- \"lab_tests\": [
111
- {
112
- \"reference_range\": \"12.0 - 17.0\",
113
- \"test_name\": \"haemoglobin\",
114
- \"unit\": \"gms/dl\",
115
- \"value\": \"14\"
116
- },
117
- {
118
- \"reference_range\": \"4.1-5.1\",
119
- \"test_name\": \"rbc count\",
120
- \"unit\": \"mill/cu mm\",
121
- \"value\": \"4.4\"
122
- },
123
- {
124
- \"reference_range\": \"32.0 - 47.0\",
125
- \"test_name\": \"haematocrit (pcv)\",
126
- \"unit\": \"%\",
127
- \"value\": \"30\"
128
- },
129
- {
130
- \"reference_range\": \"76.0 - 100.0\",
131
- \"test_name\": \"mcv\",
132
- \"unit\": \"fl\",
133
- \"value\": \"78\"
134
- },
135
- {
136
- \"reference_range\": \"26.0-32.0\",
137
- \"test_name\": \"mch\",
138
- \"unit\": \"pg\",
139
- \"value\": \"32.46\"
140
- },
141
- {
142
- \"reference_range\": \"31.5-34.5\",
143
- \"test_name\": \"mchc\",
144
- \"unit\": \"%\",
145
- \"value\": \"32.8\"
146
- },
147
- {
148
- \"reference_range\": \"11.6-15.0\",
149
- \"test_name\": \"rdw\",
150
- \"unit\": \"%\",
151
- \"value\": \"13.9\"
152
- },
153
- {
154
- \"reference_range\": \"6.8- 12.6\",
155
- \"test_name\": \"mpv\",
156
- \"unit\": \"fn\",
157
- \"value\": \"11.2\"
158
- },
159
- {
160
- \"reference_range\": \"4000 - 11000\",
161
- \"test_name\": \"wbc count\",
162
- \"unit\": \"/cu mm\",
163
- \"value\": \"4567\"
164
- },
165
- {
166
- \"reference_range\": \"40-70\",
167
- \"test_name\": \"neutrophils\",
168
- \"unit\": \"%\",
169
- \"value\": \"56\"
170
- },
171
- {
172
- \"reference_range\": \"20.0- 45.0\",
173
- \"test_name\": \"lymphocytes\",
174
- \"unit\": \"%\",
175
- \"value\": \"20\"
176
- },
177
- {
178
- \"reference_range\": \"0-6\",
179
- \"test_name\": \"eosinophils\",
180
- \"unit\": \"%\",
181
- \"value\": \"4\"
182
- },
183
- {
184
- \"reference_range\": \"2-10\",
185
- \"test_name\": \"monocytes\",
186
- \"unit\": \"%\",
187
- \"value\": \"5\"
188
- }
189
- ],
190
- \"metadata\": {
191
- \"age\": \"40\",
192
- \"gender\": \"male\",
193
- \"lab_name\": \"sanjeevan hospital\",
194
- \"patient_name\": \"amar shaha\",
195
- \"report_date\": \"09-jul-20\"
196
- },
197
- \"reds\": {
198
- \"high\": [
199
- \"mch\"
200
- ],
201
- \"low\": [
202
- \"haematocrit (pcv)\"
203
- ]
204
- }
205
- }"""),
206
- ],
207
- ),
208
- types.Content(
209
- role="user",
210
- parts=[
211
- types.Part.from_text(text=extracted_text),
212
- ],
213
- ),
214
- ]
215
- generate_content_config = types.GenerateContentConfig(
216
- temperature=1,
217
- top_p=0.95,
218
- top_k=40,
219
- max_output_tokens=8192,
220
- response_mime_type="application/json",
221
- response_schema=genai.types.Schema(
222
- type = genai.types.Type.OBJECT,
223
- enum = [],
224
- required = ["metadata", "lab_tests", "reds"],
225
- properties = {
226
- "metadata": genai.types.Schema(
227
- type = genai.types.Type.OBJECT,
228
- enum = [],
229
- required = ["patient_name", "age", "gender", "lab_name", "report_date"],
230
- properties = {
231
- "patient_name": genai.types.Schema(
232
- type = genai.types.Type.STRING,
233
- ),
234
- "age": genai.types.Schema(
235
- type = genai.types.Type.STRING,
236
- ),
237
- "gender": genai.types.Schema(
238
- type = genai.types.Type.STRING,
239
- ),
240
- "lab_name": genai.types.Schema(
241
- type = genai.types.Type.STRING,
242
- ),
243
- "report_date": genai.types.Schema(
244
- type = genai.types.Type.STRING,
245
- ),
246
- },
247
- ),
248
- "lab_tests": genai.types.Schema(
249
- type = genai.types.Type.ARRAY,
250
- items = genai.types.Schema(
251
- type = genai.types.Type.OBJECT,
252
- enum = [],
253
- required = ["test_name", "value", "unit", "reference_range"],
254
- properties = {
255
- "test_name": genai.types.Schema(
256
- type = genai.types.Type.STRING,
257
- ),
258
- "value": genai.types.Schema(
259
- type = genai.types.Type.STRING,
260
- ),
261
- "unit": genai.types.Schema(
262
- type = genai.types.Type.STRING,
263
- ),
264
- "reference_range": genai.types.Schema(
265
- type = genai.types.Type.STRING,
266
- ),
267
- },
268
- ),
269
- ),
270
- "reds": genai.types.Schema(
271
- type = genai.types.Type.OBJECT,
272
- enum = [],
273
- required = ["low", "high"],
274
- properties = {
275
- "low": genai.types.Schema(
276
- type = genai.types.Type.ARRAY,
277
- items = genai.types.Schema(
278
- type = genai.types.Type.STRING,
279
- ),
280
- ),
281
- "high": genai.types.Schema(
282
- type = genai.types.Type.ARRAY,
283
- items = genai.types.Schema(
284
- type = genai.types.Type.STRING,
285
- ),
286
- ),
287
- },
288
- ),
289
- },
290
- ),
291
- system_instruction=[
292
- types.Part.from_text(text="""Always return the output as JSON only"""),
293
- ],
294
- )
295
-
296
-
297
-
298
- # for chunk in client.models.generate_content_stream(
299
- # model=model,
300
- # contents=contents,
301
- # config=generate_content_config,
302
- # ):
303
- # print(chunk.text, end="")
304
-
305
-
306
- response = client.models.generate_content(
307
- model=model,
308
- contents=contents,
309
- config=generate_content_config,
310
- )
311
-
312
- json_response = response.text # The API should return JSON text
313
- parsed_json = json.loads(json_response) # Convert JSON string to Python dictionary
314
-
315
- return parsed_json
316
-
317
-
318
-
319
-
320
-
321
- # Gradio interface function
322
- def process_pdf(pdf):
323
- text = read_pdf(pdf) # Extract text from PDF
324
- # # print(text)
325
- output = generate(text) # Generate structured JSON
326
-
327
- return output
328
-
329
-
330
- def show_to_UI(pdf):
331
- output = process_pdf(pdf) # Call process_pdf to get JSON
332
-
333
- # Extract metadata
334
  metadata = output["metadata"]
335
- # labtests = pd.DataFrame(output["lab_tests"])
336
- # reds = pd.DataFrame(output["reds"])
337
 
338
- try:
339
- labtests = pd.DataFrame(output["lab_tests"],)
340
- except Exception as e:
341
- print(f"Error creating lab tests DataFrame: {e}")
342
- labtests = pd.DataFrame() # Return empty DataFrame
343
-
344
- try:
345
- highs = pd.DataFrame(output["reds"]["high"],index=True)
346
- except Exception as e:
347
- print(f"Error creating highs DataFrame: {e}")
348
- highs = pd.DataFrame() # Return empty DataFrame
349
-
350
- try:
351
- lows = pd.DataFrame(output["reds"]["low"],)
352
- except Exception as e:
353
- print(f"Error creating lowss DataFrame: {e}")
354
- lows = pd.DataFrame() # Return empty DataFrame
355
 
356
  metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
357
  f"**Age:** {metadata['age']}\n\n" \
@@ -359,27 +23,29 @@ def show_to_UI(pdf):
359
  f"**Lab Name:** {metadata['lab_name']}\n\n" \
360
  f"**Report Date:** {metadata['report_date']}"
361
 
362
-
363
-
364
- return metadata_str,highs,lows, labtests,output
365
-
366
-
367
 
 
368
 
369
- # Define Gradio interface
370
  with gr.Blocks() as demo:
371
- gr.Markdown("# Medical Lab Report Processor")
372
 
373
  with gr.Row():
374
- pdf_input = gr.File(label="Upload PDF Report")
375
  submit_btn = gr.Button("Process")
376
 
377
- metadata_output = gr.Markdown("**Patient Name: Prashasst...**")
 
378
  with gr.Row():
379
- high_output = gr.Dataframe(label="High Values")
380
- low_output = gr.Dataframe(label="Low Values")
381
- lab_test_output = gr.Dataframe(label="Lab Test Results")
382
- output_JSON = gr.JSON(label="Extracted Report") # Show JSON output
 
 
 
 
 
 
383
 
384
- submit_btn.click(show_to_UI, inputs=[pdf_input], outputs=[metadata_output, high_output, low_output,lab_test_output, output_JSON])
385
- demo.launch(debug=True,share=True)
 
1
  import gradio as gr
2
  import pandas as pd
3
+ from file_processing import FileProcessor
4
+ from entity_recognition import process_text
5
+ from utils import safe_dataframe
6
+
7
+ def show_to_UI(file):
8
+ """Processes the uploaded file and extracts medical data."""
9
+ processor = FileProcessor()
10
+ text = processor.process(file.name) # Read content
11
+ output = process_text(text) # Perform entity recognition
12
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  metadata = output["metadata"]
 
 
14
 
15
+ # Convert extracted data safely
16
+ highs = safe_dataframe(output["reds"], "high")
17
+ lows = safe_dataframe(output["reds"], "low")
18
+ labtests = safe_dataframe(output, "lab_tests")
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
21
  f"**Age:** {metadata['age']}\n\n" \
 
23
  f"**Lab Name:** {metadata['lab_name']}\n\n" \
24
  f"**Report Date:** {metadata['report_date']}"
25
 
26
+ print(f"Processed report for {metadata['patient_name']}")
 
 
 
 
27
 
28
+ return metadata_str, highs, lows, labtests, output
29
 
30
+ # ✅ Gradio Interface
31
  with gr.Blocks() as demo:
32
+ gr.Markdown("# 🏥 Medical Lab Report Processor")
33
 
34
  with gr.Row():
35
+ pdf_input = gr.File(label="📂 Upload Report")
36
  submit_btn = gr.Button("Process")
37
 
38
+ metadata_output = gr.Markdown("**Patient Name: Loading...**")
39
+
40
  with gr.Row():
41
+ high_output = gr.Dataframe(label="🔺 High Values")
42
+ low_output = gr.Dataframe(label="🔻 Low Values")
43
+
44
+ lab_test_output = gr.Dataframe(label="📊 Lab Test Results")
45
+ output_JSON = gr.JSON(label="📜 Extracted Report")
46
+
47
+ submit_btn.click(show_to_UI, inputs=[pdf_input], outputs=[metadata_output, high_output, low_output, lab_test_output, output_JSON])
48
+
49
+ demo.launch()
50
+
51