Update app.py
Browse files
app.py
CHANGED
@@ -1,357 +1,21 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
-
import
|
4 |
-
import
|
5 |
-
from
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
def read_pdf(pdf_path):
|
17 |
-
text = ""
|
18 |
-
doc = fitz.open(pdf_path)
|
19 |
-
|
20 |
-
for page_num in range(len(doc)):
|
21 |
-
page = doc.load_page(page_num)
|
22 |
-
page_text = page.get_text("text").strip() # Extract text from page
|
23 |
-
|
24 |
-
# Extract Images for OCR
|
25 |
-
images = page.get_images(full=True) # Check if the page has images
|
26 |
-
|
27 |
-
ocr_text = ""
|
28 |
-
if images: # If images exist, process them
|
29 |
-
print(f"Page {page_num + 1} contains images, performing OCR...")
|
30 |
-
img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
|
31 |
-
|
32 |
-
for img in img_pages:
|
33 |
-
ocr_text += pytesseract.image_to_string(img).strip() + "\n"
|
34 |
-
|
35 |
-
# Combine both text extraction methods
|
36 |
-
combined_text = f"{page_text}\n{ocr_text}".strip()
|
37 |
-
|
38 |
-
if combined_text:
|
39 |
-
text += combined_text + "\n\n"
|
40 |
-
|
41 |
-
return text.strip()
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
def generate(extracted_text):
|
49 |
-
client = genai.Client(
|
50 |
-
api_key=google_api,
|
51 |
-
)
|
52 |
-
|
53 |
-
model = "gemini-2.0-flash"
|
54 |
-
contents = [
|
55 |
-
types.Content(
|
56 |
-
role="user",
|
57 |
-
parts=[
|
58 |
-
types.Part.from_text(text="""The following text is extracted from a medical lab report using OCR.
|
59 |
-
There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
|
60 |
-
Please correct the errors and extract both metadata and structured lab test data.
|
61 |
-
ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
|
62 |
-
AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
|
63 |
-
Return the output in structured JSON format with all the information in lowercase to standardization.
|
64 |
-
And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified
|
65 |
-
|
66 |
-
|
67 |
-
Extracted Text:
|
68 |
-
Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |Β» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 Β« % 0-6\\nMONOCYTES : 5 %
|
69 |
-
|
70 |
-
Expected JSON format:
|
71 |
-
{
|
72 |
-
\"metadata\": {
|
73 |
-
\"patient_name\": \"Prasahsst Pawar\",
|
74 |
-
\"age\": \"20\",
|
75 |
-
\"gender\": \"Male\",
|
76 |
-
\"lab_name\": \"XYZ Diagnostics\",
|
77 |
-
\"report_date\": \"05-03-2025\"
|
78 |
-
},
|
79 |
-
\"lab_tests\": [
|
80 |
-
{
|
81 |
-
\"test_name\": \"hemoglobin\",
|
82 |
-
\"value\": \"14.2\",
|
83 |
-
\"unit\": \"g/dL\",
|
84 |
-
\"reference_range\": \"13.5 - 17.5 g/dL\"
|
85 |
-
},
|
86 |
-
{
|
87 |
-
\"test_name\": \"rbc count\",
|
88 |
-
\"value\": \"5.2\",
|
89 |
-
\"unit\": \"million/cu mm\",
|
90 |
-
\"reference_range\": \"4.1-5.1\"
|
91 |
-
},
|
92 |
-
{
|
93 |
-
\"test_name\": \"glucose\",
|
94 |
-
\"value\": \"65\",
|
95 |
-
\"unit\": \"mg/dL\",
|
96 |
-
\"reference_range\": \"70 - 110 mg/dL\"
|
97 |
-
}
|
98 |
-
],
|
99 |
-
\"reds\":{
|
100 |
-
\"low\":[\"glucose\"],
|
101 |
-
\"high\":[\"rbc count\"]
|
102 |
-
}
|
103 |
-
}"""),
|
104 |
-
],
|
105 |
-
),
|
106 |
-
types.Content(
|
107 |
-
role="model",
|
108 |
-
parts=[
|
109 |
-
types.Part.from_text(text="""{
|
110 |
-
\"lab_tests\": [
|
111 |
-
{
|
112 |
-
\"reference_range\": \"12.0 - 17.0\",
|
113 |
-
\"test_name\": \"haemoglobin\",
|
114 |
-
\"unit\": \"gms/dl\",
|
115 |
-
\"value\": \"14\"
|
116 |
-
},
|
117 |
-
{
|
118 |
-
\"reference_range\": \"4.1-5.1\",
|
119 |
-
\"test_name\": \"rbc count\",
|
120 |
-
\"unit\": \"mill/cu mm\",
|
121 |
-
\"value\": \"4.4\"
|
122 |
-
},
|
123 |
-
{
|
124 |
-
\"reference_range\": \"32.0 - 47.0\",
|
125 |
-
\"test_name\": \"haematocrit (pcv)\",
|
126 |
-
\"unit\": \"%\",
|
127 |
-
\"value\": \"30\"
|
128 |
-
},
|
129 |
-
{
|
130 |
-
\"reference_range\": \"76.0 - 100.0\",
|
131 |
-
\"test_name\": \"mcv\",
|
132 |
-
\"unit\": \"fl\",
|
133 |
-
\"value\": \"78\"
|
134 |
-
},
|
135 |
-
{
|
136 |
-
\"reference_range\": \"26.0-32.0\",
|
137 |
-
\"test_name\": \"mch\",
|
138 |
-
\"unit\": \"pg\",
|
139 |
-
\"value\": \"32.46\"
|
140 |
-
},
|
141 |
-
{
|
142 |
-
\"reference_range\": \"31.5-34.5\",
|
143 |
-
\"test_name\": \"mchc\",
|
144 |
-
\"unit\": \"%\",
|
145 |
-
\"value\": \"32.8\"
|
146 |
-
},
|
147 |
-
{
|
148 |
-
\"reference_range\": \"11.6-15.0\",
|
149 |
-
\"test_name\": \"rdw\",
|
150 |
-
\"unit\": \"%\",
|
151 |
-
\"value\": \"13.9\"
|
152 |
-
},
|
153 |
-
{
|
154 |
-
\"reference_range\": \"6.8- 12.6\",
|
155 |
-
\"test_name\": \"mpv\",
|
156 |
-
\"unit\": \"fn\",
|
157 |
-
\"value\": \"11.2\"
|
158 |
-
},
|
159 |
-
{
|
160 |
-
\"reference_range\": \"4000 - 11000\",
|
161 |
-
\"test_name\": \"wbc count\",
|
162 |
-
\"unit\": \"/cu mm\",
|
163 |
-
\"value\": \"4567\"
|
164 |
-
},
|
165 |
-
{
|
166 |
-
\"reference_range\": \"40-70\",
|
167 |
-
\"test_name\": \"neutrophils\",
|
168 |
-
\"unit\": \"%\",
|
169 |
-
\"value\": \"56\"
|
170 |
-
},
|
171 |
-
{
|
172 |
-
\"reference_range\": \"20.0- 45.0\",
|
173 |
-
\"test_name\": \"lymphocytes\",
|
174 |
-
\"unit\": \"%\",
|
175 |
-
\"value\": \"20\"
|
176 |
-
},
|
177 |
-
{
|
178 |
-
\"reference_range\": \"0-6\",
|
179 |
-
\"test_name\": \"eosinophils\",
|
180 |
-
\"unit\": \"%\",
|
181 |
-
\"value\": \"4\"
|
182 |
-
},
|
183 |
-
{
|
184 |
-
\"reference_range\": \"2-10\",
|
185 |
-
\"test_name\": \"monocytes\",
|
186 |
-
\"unit\": \"%\",
|
187 |
-
\"value\": \"5\"
|
188 |
-
}
|
189 |
-
],
|
190 |
-
\"metadata\": {
|
191 |
-
\"age\": \"40\",
|
192 |
-
\"gender\": \"male\",
|
193 |
-
\"lab_name\": \"sanjeevan hospital\",
|
194 |
-
\"patient_name\": \"amar shaha\",
|
195 |
-
\"report_date\": \"09-jul-20\"
|
196 |
-
},
|
197 |
-
\"reds\": {
|
198 |
-
\"high\": [
|
199 |
-
\"mch\"
|
200 |
-
],
|
201 |
-
\"low\": [
|
202 |
-
\"haematocrit (pcv)\"
|
203 |
-
]
|
204 |
-
}
|
205 |
-
}"""),
|
206 |
-
],
|
207 |
-
),
|
208 |
-
types.Content(
|
209 |
-
role="user",
|
210 |
-
parts=[
|
211 |
-
types.Part.from_text(text=extracted_text),
|
212 |
-
],
|
213 |
-
),
|
214 |
-
]
|
215 |
-
generate_content_config = types.GenerateContentConfig(
|
216 |
-
temperature=1,
|
217 |
-
top_p=0.95,
|
218 |
-
top_k=40,
|
219 |
-
max_output_tokens=8192,
|
220 |
-
response_mime_type="application/json",
|
221 |
-
response_schema=genai.types.Schema(
|
222 |
-
type = genai.types.Type.OBJECT,
|
223 |
-
enum = [],
|
224 |
-
required = ["metadata", "lab_tests", "reds"],
|
225 |
-
properties = {
|
226 |
-
"metadata": genai.types.Schema(
|
227 |
-
type = genai.types.Type.OBJECT,
|
228 |
-
enum = [],
|
229 |
-
required = ["patient_name", "age", "gender", "lab_name", "report_date"],
|
230 |
-
properties = {
|
231 |
-
"patient_name": genai.types.Schema(
|
232 |
-
type = genai.types.Type.STRING,
|
233 |
-
),
|
234 |
-
"age": genai.types.Schema(
|
235 |
-
type = genai.types.Type.STRING,
|
236 |
-
),
|
237 |
-
"gender": genai.types.Schema(
|
238 |
-
type = genai.types.Type.STRING,
|
239 |
-
),
|
240 |
-
"lab_name": genai.types.Schema(
|
241 |
-
type = genai.types.Type.STRING,
|
242 |
-
),
|
243 |
-
"report_date": genai.types.Schema(
|
244 |
-
type = genai.types.Type.STRING,
|
245 |
-
),
|
246 |
-
},
|
247 |
-
),
|
248 |
-
"lab_tests": genai.types.Schema(
|
249 |
-
type = genai.types.Type.ARRAY,
|
250 |
-
items = genai.types.Schema(
|
251 |
-
type = genai.types.Type.OBJECT,
|
252 |
-
enum = [],
|
253 |
-
required = ["test_name", "value", "unit", "reference_range"],
|
254 |
-
properties = {
|
255 |
-
"test_name": genai.types.Schema(
|
256 |
-
type = genai.types.Type.STRING,
|
257 |
-
),
|
258 |
-
"value": genai.types.Schema(
|
259 |
-
type = genai.types.Type.STRING,
|
260 |
-
),
|
261 |
-
"unit": genai.types.Schema(
|
262 |
-
type = genai.types.Type.STRING,
|
263 |
-
),
|
264 |
-
"reference_range": genai.types.Schema(
|
265 |
-
type = genai.types.Type.STRING,
|
266 |
-
),
|
267 |
-
},
|
268 |
-
),
|
269 |
-
),
|
270 |
-
"reds": genai.types.Schema(
|
271 |
-
type = genai.types.Type.OBJECT,
|
272 |
-
enum = [],
|
273 |
-
required = ["low", "high"],
|
274 |
-
properties = {
|
275 |
-
"low": genai.types.Schema(
|
276 |
-
type = genai.types.Type.ARRAY,
|
277 |
-
items = genai.types.Schema(
|
278 |
-
type = genai.types.Type.STRING,
|
279 |
-
),
|
280 |
-
),
|
281 |
-
"high": genai.types.Schema(
|
282 |
-
type = genai.types.Type.ARRAY,
|
283 |
-
items = genai.types.Schema(
|
284 |
-
type = genai.types.Type.STRING,
|
285 |
-
),
|
286 |
-
),
|
287 |
-
},
|
288 |
-
),
|
289 |
-
},
|
290 |
-
),
|
291 |
-
system_instruction=[
|
292 |
-
types.Part.from_text(text="""Always return the output as JSON only"""),
|
293 |
-
],
|
294 |
-
)
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
# for chunk in client.models.generate_content_stream(
|
299 |
-
# model=model,
|
300 |
-
# contents=contents,
|
301 |
-
# config=generate_content_config,
|
302 |
-
# ):
|
303 |
-
# print(chunk.text, end="")
|
304 |
-
|
305 |
-
|
306 |
-
response = client.models.generate_content(
|
307 |
-
model=model,
|
308 |
-
contents=contents,
|
309 |
-
config=generate_content_config,
|
310 |
-
)
|
311 |
-
|
312 |
-
json_response = response.text # The API should return JSON text
|
313 |
-
parsed_json = json.loads(json_response) # Convert JSON string to Python dictionary
|
314 |
-
|
315 |
-
return parsed_json
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
# Gradio interface function
|
322 |
-
def process_pdf(pdf):
|
323 |
-
text = read_pdf(pdf) # Extract text from PDF
|
324 |
-
# # print(text)
|
325 |
-
output = generate(text) # Generate structured JSON
|
326 |
-
|
327 |
-
return output
|
328 |
-
|
329 |
-
|
330 |
-
def show_to_UI(pdf):
|
331 |
-
output = process_pdf(pdf) # Call process_pdf to get JSON
|
332 |
-
|
333 |
-
# Extract metadata
|
334 |
metadata = output["metadata"]
|
335 |
-
# labtests = pd.DataFrame(output["lab_tests"])
|
336 |
-
# reds = pd.DataFrame(output["reds"])
|
337 |
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
labtests = pd.DataFrame() # Return empty DataFrame
|
343 |
-
|
344 |
-
try:
|
345 |
-
highs = pd.DataFrame(output["reds"]["high"],index=True)
|
346 |
-
except Exception as e:
|
347 |
-
print(f"Error creating highs DataFrame: {e}")
|
348 |
-
highs = pd.DataFrame() # Return empty DataFrame
|
349 |
-
|
350 |
-
try:
|
351 |
-
lows = pd.DataFrame(output["reds"]["low"],)
|
352 |
-
except Exception as e:
|
353 |
-
print(f"Error creating lowss DataFrame: {e}")
|
354 |
-
lows = pd.DataFrame() # Return empty DataFrame
|
355 |
|
356 |
metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
|
357 |
f"**Age:** {metadata['age']}\n\n" \
|
@@ -359,27 +23,29 @@ def show_to_UI(pdf):
|
|
359 |
f"**Lab Name:** {metadata['lab_name']}\n\n" \
|
360 |
f"**Report Date:** {metadata['report_date']}"
|
361 |
|
362 |
-
|
363 |
-
|
364 |
-
return metadata_str,highs,lows, labtests,output
|
365 |
-
|
366 |
-
|
367 |
|
|
|
368 |
|
369 |
-
#
|
370 |
with gr.Blocks() as demo:
|
371 |
-
gr.Markdown("# Medical Lab Report Processor")
|
372 |
|
373 |
with gr.Row():
|
374 |
-
pdf_input = gr.File(label="Upload
|
375 |
submit_btn = gr.Button("Process")
|
376 |
|
377 |
-
metadata_output = gr.Markdown("**Patient Name:
|
|
|
378 |
with gr.Row():
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
383 |
|
384 |
-
submit_btn.click(show_to_UI, inputs=[pdf_input], outputs=[metadata_output, high_output, low_output,lab_test_output, output_JSON])
|
385 |
-
demo.launch(debug=True,share=True)
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
+
from file_processing import FileProcessor
|
4 |
+
from entity_recognition import process_text
|
5 |
+
from utils import safe_dataframe
|
6 |
+
|
7 |
+
def show_to_UI(file):
|
8 |
+
"""Processes the uploaded file and extracts medical data."""
|
9 |
+
processor = FileProcessor()
|
10 |
+
text = processor.process(file.name) # Read content
|
11 |
+
output = process_text(text) # Perform entity recognition
|
12 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
metadata = output["metadata"]
|
|
|
|
|
14 |
|
15 |
+
# Convert extracted data safely
|
16 |
+
highs = safe_dataframe(output["reds"], "high")
|
17 |
+
lows = safe_dataframe(output["reds"], "low")
|
18 |
+
labtests = safe_dataframe(output, "lab_tests")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
metadata_str = f"**Patient Name:** {metadata['patient_name']}\n\n" \
|
21 |
f"**Age:** {metadata['age']}\n\n" \
|
|
|
23 |
f"**Lab Name:** {metadata['lab_name']}\n\n" \
|
24 |
f"**Report Date:** {metadata['report_date']}"
|
25 |
|
26 |
+
print(f"Processed report for {metadata['patient_name']}")
|
|
|
|
|
|
|
|
|
27 |
|
28 |
+
return metadata_str, highs, lows, labtests, output
|
29 |
|
30 |
+
# ✅ Gradio Interface
|
31 |
with gr.Blocks() as demo:
|
32 |
+
gr.Markdown("# 🏥 Medical Lab Report Processor")
|
33 |
|
34 |
with gr.Row():
|
35 |
+
pdf_input = gr.File(label="π Upload Report")
|
36 |
submit_btn = gr.Button("Process")
|
37 |
|
38 |
+
metadata_output = gr.Markdown("**Patient Name: Loading...**")
|
39 |
+
|
40 |
with gr.Row():
|
41 |
+
high_output = gr.Dataframe(label="🔺 High Values")
|
42 |
+
low_output = gr.Dataframe(label="🔻 Low Values")
|
43 |
+
|
44 |
+
lab_test_output = gr.Dataframe(label="π Lab Test Results")
|
45 |
+
output_JSON = gr.JSON(label="π Extracted Report")
|
46 |
+
|
47 |
+
submit_btn.click(show_to_UI, inputs=[pdf_input], outputs=[metadata_output, high_output, low_output, lab_test_output, output_JSON])
|
48 |
+
|
49 |
+
demo.launch()
|
50 |
+
|
51 |
|
|
|
|