Prashasst committed on
Commit
52bb493
·
verified ·
1 Parent(s): b73ad8a

Update app.py

Browse files

Added JSON output to the API

Files changed (1) hide show
  1. app.py +301 -20
app.py CHANGED
@@ -1,30 +1,305 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import os
 
 
 
 
 
 
4
  googel_api=os.getenv("google_api")
5
 
6
 
7
 
8
- # Your existing functions: read_pdf, generate, showdata
9
  def read_pdf(pdf_path):
10
- # Implement PDF reading logic here
11
- return "Extracted text from PDF"
12
-
13
- def generate(text):
14
- # Implement JSON generation logic here
15
- return {
16
- "metadata": {
17
- "patient_name": "Amar Shaha",
18
- "age": "40",
19
- "gender": "Male",
20
- "lab_name": "Sanjeevan Hospital",
21
- "report_date": "09-Jul-2020"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  },
23
- "lab_tests": [
24
- {"test_name": "hemoglobin", "value": "14", "unit": "g/dL", "reference_range": "12.0 - 17.0"},
25
- {"test_name": "rbc count", "value": "4.4", "unit": "million/cu mm", "reference_range": "4.1 - 5.1"}
26
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def showdata(lab_tests):
30
  df = pd.DataFrame(lab_tests)
@@ -33,17 +308,19 @@ def showdata(lab_tests):
33
  # Gradio interface function
34
  def process_pdf(pdf):
35
  text = read_pdf(pdf.name) # Extract text from PDF
36
- json_data = generate(text) # Generate structured JSON
37
 
 
38
  metadata = json_data["metadata"]
 
 
39
  metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
40
  f"**Age:** {metadata['age']}\n\n" \
41
  f"**Gender:** {metadata['gender']}\n\n" \
42
  f"**Lab Name:** {metadata['lab_name']}\n\n" \
43
  f"**Report Date:** {metadata['report_date']}"
44
 
45
- lab_tests_df = showdata(json_data["lab_tests"]) # Convert lab test results to DataFrame
46
- return metadata_str, lab_tests_df
47
 
48
  # Define Gradio interface
49
  with gr.Blocks() as demo:
@@ -58,5 +335,9 @@ with gr.Blocks() as demo:
58
 
59
  submit_btn.click(process_pdf, inputs=[pdf_input], outputs=[metadata_output, lab_test_output])
60
 
 
 
 
 
61
  # Launch the app
62
  demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
  import os
4
+ import fitz
5
+ import pytesseract
6
+ import base64
7
+ from google import genai
8
+ from google.genai import types
9
+
# Gemini API key, read from the "google_api" environment variable (None when
# the variable is not set).  The misspelled name `googel_api` is kept as an
# alias so that any code still referring to it keeps working.
google_api = googel_api = os.getenv("google_api")
11
 
12
 
13
 
 
def read_pdf(pdf_path):
    """Extract the text of a PDF, using OCR for pages without a text layer.

    Each page is first read through PyMuPDF.  When a page yields no text
    (typically a scanned image) it is rendered to a PNG and passed to
    Tesseract instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, with leading and trailing
        whitespace removed.
    """
    import tempfile

    # 200 dpi expressed as a zoom factor over PDF's native 72 dpi; using a
    # matrix keeps this compatible with PyMuPDF versions lacking `dpi=`.
    ocr_zoom = 200 / 72

    page_texts = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text()
            if page_text.strip():
                page_texts.append(page_text)
                continue

            # Image-only page: rasterise it with PyMuPDF and OCR the PNG.
            # pytesseract accepts an image file path, so no extra imaging
            # library is required.
            pixmap = page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
            with tempfile.TemporaryDirectory() as tmp_dir:
                image_path = os.path.join(tmp_dir, "page.png")
                pixmap.save(image_path)
                page_texts.append(pytesseract.image_to_string(image_path))

    return "\n".join(page_texts).strip()
29
+
30
+
31
+
32
+
# Few-shot example, user turn: the extraction instructions, a sample OCR dump
# and the JSON layout the model must follow.  Sent to the model verbatim.
_EXAMPLE_PROMPT = """The following text is extracted from a medical lab report using OCR.
There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
Please correct the errors and extract both metadata and structured lab test data.
ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
Return the output in structured JSON format with all the information in lowercase to standardization.
And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified


Extracted Text:
Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 « % 0-6\\nMONOCYTES : 5 %

Expected JSON format:
{
\"metadata\": {
\"patient_name\": \"Prasahsst Pawar\",
\"age\": \"20\",
\"gender\": \"Male\",
\"lab_name\": \"XYZ Diagnostics\",
\"report_date\": \"05-03-2025\"
},
\"lab_tests\": [
{
\"test_name\": \"hemoglobin\",
\"value\": \"14.2\",
\"unit\": \"g/dL\",
\"reference_range\": \"13.5 - 17.5 g/dL\"
},
{
\"test_name\": \"rbc count\",
\"value\": \"5.2\",
\"unit\": \"million/cu mm\",
\"reference_range\": \"4.1-5.1\"
},
{
\"test_name\": \"glucose\",
\"value\": \"65\",
\"unit\": \"mg/dL\",
\"reference_range\": \"70 - 110 mg/dL\"
}
],
\"reds\":{
\"low\":[\"glucose\"],
\"high\":[\"rbc count\"]
}
}"""

# Few-shot example, model turn: the expected answer for the sample above.
_EXAMPLE_ANSWER = """{
\"lab_tests\": [
{
\"reference_range\": \"12.0 - 17.0\",
\"test_name\": \"haemoglobin\",
\"unit\": \"gms/dl\",
\"value\": \"14\"
},
{
\"reference_range\": \"4.1-5.1\",
\"test_name\": \"rbc count\",
\"unit\": \"mill/cu mm\",
\"value\": \"4.4\"
},
{
\"reference_range\": \"32.0 - 47.0\",
\"test_name\": \"haematocrit (pcv)\",
\"unit\": \"%\",
\"value\": \"30\"
},
{
\"reference_range\": \"76.0 - 100.0\",
\"test_name\": \"mcv\",
\"unit\": \"fl\",
\"value\": \"78\"
},
{
\"reference_range\": \"26.0-32.0\",
\"test_name\": \"mch\",
\"unit\": \"pg\",
\"value\": \"32.46\"
},
{
\"reference_range\": \"31.5-34.5\",
\"test_name\": \"mchc\",
\"unit\": \"%\",
\"value\": \"32.8\"
},
{
\"reference_range\": \"11.6-15.0\",
\"test_name\": \"rdw\",
\"unit\": \"%\",
\"value\": \"13.9\"
},
{
\"reference_range\": \"6.8- 12.6\",
\"test_name\": \"mpv\",
\"unit\": \"fn\",
\"value\": \"11.2\"
},
{
\"reference_range\": \"4000 - 11000\",
\"test_name\": \"wbc count\",
\"unit\": \"/cu mm\",
\"value\": \"4567\"
},
{
\"reference_range\": \"40-70\",
\"test_name\": \"neutrophils\",
\"unit\": \"%\",
\"value\": \"56\"
},
{
\"reference_range\": \"20.0- 45.0\",
\"test_name\": \"lymphocytes\",
\"unit\": \"%\",
\"value\": \"20\"
},
{
\"reference_range\": \"0-6\",
\"test_name\": \"eosinophils\",
\"unit\": \"%\",
\"value\": \"4\"
},
{
\"reference_range\": \"2-10\",
\"test_name\": \"monocytes\",
\"unit\": \"%\",
\"value\": \"5\"
}
],
\"metadata\": {
\"age\": \"40\",
\"gender\": \"male\",
\"lab_name\": \"sanjeevan hospital\",
\"patient_name\": \"amar shaha\",
\"report_date\": \"09-jul-20\"
},
\"reds\": {
\"high\": [
\"mch\"
],
\"low\": [
\"haematocrit (pcv)\"
]
}
}"""

_METADATA_FIELDS = ["patient_name", "age", "gender", "lab_name", "report_date"]
_LAB_TEST_FIELDS = ["test_name", "value", "unit", "reference_range"]
_REDS_FIELDS = ["low", "high"]


def _string_schema():
    """Return a fresh schema node describing a plain string."""
    return genai.types.Schema(type=genai.types.Type.STRING)


def _string_object_schema(field_names):
    """Return an object schema whose listed fields are all required strings."""
    return genai.types.Schema(
        type=genai.types.Type.OBJECT,
        enum=[],
        required=list(field_names),
        properties={name: _string_schema() for name in field_names},
    )


def _string_array_schema():
    """Return a schema node describing an array of strings."""
    return genai.types.Schema(
        type=genai.types.Type.ARRAY,
        items=_string_schema(),
    )


def _response_schema():
    """Return the schema of the reply: metadata, lab_tests and reds."""
    return genai.types.Schema(
        type=genai.types.Type.OBJECT,
        enum=[],
        required=["metadata", "lab_tests", "reds"],
        properties={
            "metadata": _string_object_schema(_METADATA_FIELDS),
            "lab_tests": genai.types.Schema(
                type=genai.types.Type.ARRAY,
                items=_string_object_schema(_LAB_TEST_FIELDS),
            ),
            "reds": genai.types.Schema(
                type=genai.types.Type.OBJECT,
                enum=[],
                required=list(_REDS_FIELDS),
                properties={name: _string_array_schema() for name in _REDS_FIELDS},
            ),
        },
    )


def generate(extracted_text):
    """Turn raw lab-report text into structured data using Gemini.

    The request contains one worked example (a user prompt and the model's
    answer) followed by ``extracted_text`` as the actual user turn, and asks
    for a JSON reply constrained by a response schema.

    Args:
        extracted_text: Text extracted from the lab report.

    Returns:
        A dict parsed from the model's JSON reply.  The response schema
        requests the keys ``"metadata"``, ``"lab_tests"`` and ``"reds"``.

    Raises:
        ValueError: If the model returns no text, or text that is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    import json

    # Read the key straight from the environment so this function does not
    # depend on how the module-level variable happens to be spelled.
    client = genai.Client(
        api_key=os.getenv("google_api"),
    )

    model = "gemini-2.0-flash"
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=_EXAMPLE_PROMPT)],
        ),
        types.Content(
            role="model",
            parts=[types.Part.from_text(text=_EXAMPLE_ANSWER)],
        ),
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=extracted_text)],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_mime_type="application/json",
        response_schema=_response_schema(),
        system_instruction=[
            types.Part.from_text(text="""Always return the output as JSON only"""),
        ],
    )

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )

    json_response = response.text
    if not json_response:
        # Fail with a clear message instead of a TypeError from json.loads.
        raise ValueError("Gemini returned an empty response")

    return json.loads(json_response)
301
+
302
+
303
 
304
  def showdata(lab_tests):
305
  df = pd.DataFrame(lab_tests)
 
308
  # Gradio interface function
def process_pdf(pdf):
    """Run the full pipeline for one uploaded PDF.

    Args:
        pdf: Uploaded file object; its ``name`` attribute is passed to
            ``read_pdf`` as the path of the PDF.

    Returns:
        A 3-tuple ``(metadata_str, labtests, output)``: a Markdown summary of
        the report metadata, a DataFrame built from ``output["lab_tests"]``,
        and the complete dict returned by ``generate``.
    """
    text = read_pdf(pdf.name)  # Extract text from PDF
    output = generate(text)  # Generate structured JSON

    labtests = pd.DataFrame(output["lab_tests"])
    metadata = output["metadata"]
    # output["reds"] is deliberately not converted to a DataFrame: its "low"
    # and "high" lists can differ in length, which makes pd.DataFrame raise.
    # The values remain available to callers through `output`.

    metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
                   f"**Age:** {metadata['age']}\n\n" \
                   f"**Gender:** {metadata['gender']}\n\n" \
                   f"**Lab Name:** {metadata['lab_name']}\n\n" \
                   f"**Report Date:** {metadata['report_date']}"

    # NOTE(review): three values are returned, so whatever event wiring
    # consumes this function must declare three output components -- confirm.
    return metadata_str, labtests, output
 
324
 
325
  # Define Gradio interface
326
  with gr.Blocks() as demo:
 
335
 
336
  submit_btn.click(process_pdf, inputs=[pdf_input], outputs=[metadata_output, lab_test_output])
337
 
338
+ # Add API access but only expose JSON
339
+ demo.api(process_pdf, inputs=[gr.File(type="file")], outputs=gr.JSON(), route="/process")
340
+
341
+
342
  # Launch the app
343
  demo.launch()