|
import base64
import json
import os

import fitz
import gradio as gr
import pandas as pd
import pytesseract
from google import genai
from google.genai import types
from pdf2image import convert_from_path
|
|
|
# API key handed to ``genai.Client`` in ``generate``; read once at import time
# from the ``google_api`` environment variable (``None`` when it is unset).
google_api = os.environ.get("google_api")
|
|
|
|
|
|
|
|
|
def read_pdf(pdf_path):
    """Extract the text of a PDF, running OCR on pages that contain images.

    For every page the embedded text layer is read with PyMuPDF. When the
    page reports at least one image, that single page is also rendered with
    ``convert_from_path`` and passed through Tesseract; the OCR output is
    appended after the text-layer content of the same page.

    Args:
        pdf_path: Location of the PDF. The same value is given to both
            ``fitz.open`` and ``convert_from_path``.

    Returns:
        The non-empty per-page texts separated by a blank line, or an empty
        string when no page produced any text.
    """
    page_texts = []

    # The context manager closes the document even if OCR fails midway.
    with fitz.open(pdf_path) as doc:
        # Page numbers are 1-based because pdf2image expects them that way.
        for page_number, page in enumerate(doc, start=1):
            layer_text = page.get_text("text").strip()

            ocr_text = ""
            if page.get_images(full=True):
                print(f"Page {page_number} contains images, performing OCR...")
                rendered_pages = convert_from_path(
                    pdf_path,
                    first_page=page_number,
                    last_page=page_number,
                )
                ocr_text = "".join(
                    pytesseract.image_to_string(image).strip() + "\n"
                    for image in rendered_pages
                )

            combined_text = f"{layer_text}\n{ocr_text}".strip()
            if combined_text:
                page_texts.append(combined_text)

    return "\n\n".join(page_texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
# Few-shot "user" turn: the task instructions, one sample of raw OCR output and
# the JSON layout the model is expected to follow.
_FEW_SHOT_REQUEST = """The following text is extracted from a medical lab report using OCR.
There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
Please correct the errors and extract both metadata and structured lab test data.
ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
Return the output in structured JSON format with all the information in lowercase to standardization.
And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified


Extracted Text:
Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 « % 0-6\\nMONOCYTES : 5 %

Expected JSON format:
{
\"metadata\": {
\"patient_name\": \"Prasahsst Pawar\",
\"age\": \"20\",
\"gender\": \"Male\",
\"lab_name\": \"XYZ Diagnostics\",
\"report_date\": \"05-03-2025\"
},
\"lab_tests\": [
{
\"test_name\": \"hemoglobin\",
\"value\": \"14.2\",
\"unit\": \"g/dL\",
\"reference_range\": \"13.5 - 17.5 g/dL\"
},
{
\"test_name\": \"rbc count\",
\"value\": \"5.2\",
\"unit\": \"million/cu mm\",
\"reference_range\": \"4.1-5.1\"
},
{
\"test_name\": \"glucose\",
\"value\": \"65\",
\"unit\": \"mg/dL\",
\"reference_range\": \"70 - 110 mg/dL\"
}
],
\"reds\":{
\"low\":[\"glucose\"],
\"high\":[\"rbc count\"]
}
}"""

# Few-shot "model" turn: the answer expected for the sample report above.
_FEW_SHOT_RESPONSE = """{
\"lab_tests\": [
{
\"reference_range\": \"12.0 - 17.0\",
\"test_name\": \"haemoglobin\",
\"unit\": \"gms/dl\",
\"value\": \"14\"
},
{
\"reference_range\": \"4.1-5.1\",
\"test_name\": \"rbc count\",
\"unit\": \"mill/cu mm\",
\"value\": \"4.4\"
},
{
\"reference_range\": \"32.0 - 47.0\",
\"test_name\": \"haematocrit (pcv)\",
\"unit\": \"%\",
\"value\": \"30\"
},
{
\"reference_range\": \"76.0 - 100.0\",
\"test_name\": \"mcv\",
\"unit\": \"fl\",
\"value\": \"78\"
},
{
\"reference_range\": \"26.0-32.0\",
\"test_name\": \"mch\",
\"unit\": \"pg\",
\"value\": \"32.46\"
},
{
\"reference_range\": \"31.5-34.5\",
\"test_name\": \"mchc\",
\"unit\": \"%\",
\"value\": \"32.8\"
},
{
\"reference_range\": \"11.6-15.0\",
\"test_name\": \"rdw\",
\"unit\": \"%\",
\"value\": \"13.9\"
},
{
\"reference_range\": \"6.8- 12.6\",
\"test_name\": \"mpv\",
\"unit\": \"fn\",
\"value\": \"11.2\"
},
{
\"reference_range\": \"4000 - 11000\",
\"test_name\": \"wbc count\",
\"unit\": \"/cu mm\",
\"value\": \"4567\"
},
{
\"reference_range\": \"40-70\",
\"test_name\": \"neutrophils\",
\"unit\": \"%\",
\"value\": \"56\"
},
{
\"reference_range\": \"20.0- 45.0\",
\"test_name\": \"lymphocytes\",
\"unit\": \"%\",
\"value\": \"20\"
},
{
\"reference_range\": \"0-6\",
\"test_name\": \"eosinophils\",
\"unit\": \"%\",
\"value\": \"4\"
},
{
\"reference_range\": \"2-10\",
\"test_name\": \"monocytes\",
\"unit\": \"%\",
\"value\": \"5\"
}
],
\"metadata\": {
\"age\": \"40\",
\"gender\": \"male\",
\"lab_name\": \"sanjeevan hospital\",
\"patient_name\": \"amar shaha\",
\"report_date\": \"09-jul-20\"
},
\"reds\": {
\"high\": [
\"mch\"
],
\"low\": [
\"haematocrit (pcv)\"
]
}
}"""


def _string_schema():
    """Return a schema node describing a plain string."""
    return types.Schema(type=types.Type.STRING)


def _string_list_schema():
    """Return a schema node describing an array of strings."""
    return types.Schema(type=types.Type.ARRAY, items=_string_schema())


def _record_schema(field_names):
    """Return an object schema whose listed fields are all required strings."""
    return types.Schema(
        type=types.Type.OBJECT,
        enum=[],  # kept from the original configuration; lists no values
        required=list(field_names),
        properties={name: _string_schema() for name in field_names},
    )


def _response_schema():
    """Return the schema of the reply: ``metadata``, ``lab_tests`` and ``reds``."""
    return types.Schema(
        type=types.Type.OBJECT,
        enum=[],
        required=["metadata", "lab_tests", "reds"],
        properties={
            "metadata": _record_schema(
                ["patient_name", "age", "gender", "lab_name", "report_date"]
            ),
            "lab_tests": types.Schema(
                type=types.Type.ARRAY,
                items=_record_schema(
                    ["test_name", "value", "unit", "reference_range"]
                ),
            ),
            "reds": types.Schema(
                type=types.Type.OBJECT,
                enum=[],
                required=["low", "high"],
                properties={
                    "low": _string_list_schema(),
                    "high": _string_list_schema(),
                },
            ),
        },
    )


def _user_turn(text):
    """Wrap ``text`` as a single-part conversation turn with role ``user``."""
    return types.Content(role="user", parts=[types.Part.from_text(text=text)])


def generate(extracted_text):
    """Ask Gemini to turn raw lab-report text into structured JSON.

    The request is a three-turn conversation: a worked example (instructions
    plus sample OCR text), the expected answer for that example, and finally
    ``extracted_text`` as the turn the model has to answer. The reply is
    constrained to JSON matching ``_response_schema()``.

    Args:
        extracted_text: Text of the report to structure, e.g. the output of
            ``read_pdf``.

    Returns:
        The decoded JSON reply: a dict with the keys ``metadata``,
        ``lab_tests`` and ``reds`` as requested by the response schema.

    Raises:
        ValueError: If the model returned no text, or text that is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    client = genai.Client(api_key=google_api)

    contents = [
        _user_turn(_FEW_SHOT_REQUEST),
        types.Content(
            role="model",
            parts=[types.Part.from_text(text=_FEW_SHOT_RESPONSE)],
        ),
        _user_turn(extracted_text),
    ]

    # Sampling settings are carried over unchanged from the original code.
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_mime_type="application/json",
        response_schema=_response_schema(),
        system_instruction=[
            types.Part.from_text(text="""Always return the output as JSON only"""),
        ],
    )

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=contents,
        config=generate_content_config,
    )

    json_response = response.text
    # Fail with a readable message instead of a TypeError from json.loads
    # when the reply carries no text at all.
    if not json_response:
        raise ValueError("The model returned an empty response; nothing to parse.")

    return json.loads(json_response)
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_pdf(pdf):
    """Run the full pipeline for one report: extract its text, then structure it.

    Args:
        pdf: The PDF to process; forwarded unchanged to ``read_pdf``.

    Returns:
        Whatever ``generate`` returns for the text extracted from ``pdf``.
    """
    return generate(read_pdf(pdf))
|
|
|
|
|
def _frame_or_empty(output, keys, label):
    """Build a DataFrame from ``output`` at the nested ``keys`` path, or an empty one on failure."""
    try:
        data = output
        for key in keys:
            data = data[key]
        return pd.DataFrame(data)
    except Exception as e:
        # Deliberately best-effort: a malformed section must not stop the
        # other sections of the report from being displayed.
        print(f"Error creating {label} DataFrame: {e}")
        return pd.DataFrame()


def show_to_UI(pdf):
    """Process an uploaded report and shape the result for the Gradio outputs.

    Args:
        pdf: The uploaded file as delivered by the ``gr.File`` input;
            forwarded to ``process_pdf``.

    Returns:
        A 5-tuple ``(metadata_markdown, highs, lows, lab_tests, output)``:
        the patient metadata as a Markdown string, DataFrames for the tests
        flagged high and low, a DataFrame with all lab tests, and the raw
        dict returned by ``process_pdf``.

    Raises:
        gr.Error: If ``pdf`` is ``None``.
        KeyError: If the metadata section or one of its fields is missing.
    """
    # presumably None when "Process" is clicked before a file is uploaded;
    # report that in the UI rather than passing None down to the PDF reader.
    if pdf is None:
        raise gr.Error("Please upload a PDF report first.")

    output = process_pdf(pdf)
    metadata = output["metadata"]

    labtests = _frame_or_empty(output, ("lab_tests",), "lab tests")
    # The original passed ``index=True`` here, which pandas rejects; the
    # resulting exception was swallowed and this table was always empty.
    highs = _frame_or_empty(output, ("reds", "high"), "highs")
    lows = _frame_or_empty(output, ("reds", "low"), "lows")

    metadata_fields = (
        ("Patient Name", "patient_name"),
        ("Age", "age"),
        ("Gender", "gender"),
        ("Lab Name", "lab_name"),
        ("Report Date", "report_date"),
    )
    metadata_str = "\n\n".join(
        f"**{title}:** {metadata[key]}" for title, key in metadata_fields
    )

    return metadata_str, highs, lows, labtests, output
|
|
|
|
|
|
|
|
|
|
|
# Gradio front end: upload a PDF, press "Process", and view the extracted
# metadata, the flagged high/low tests, the full results table and the raw JSON.
# NOTE(review): which components sit inside each row is assumed here — confirm
# against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# Medical Lab Report Processor")

    # Input row: file picker next to the trigger button.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF Report")
        submit_btn = gr.Button("Process")

    # Placeholder text shown until the first report has been processed.
    metadata_output = gr.Markdown("**Patient Name: Prashasst...**")

    # Flagged values side by side.
    with gr.Row():
        high_output = gr.Dataframe(label="High Values")
        low_output = gr.Dataframe(label="Low Values")

    lab_test_output = gr.Dataframe(label="Lab Test Results")
    output_JSON = gr.JSON(label="Extracted Report")

    # Order must match the tuple returned by show_to_UI.
    result_components = [
        metadata_output,
        high_output,
        low_output,
        lab_test_output,
        output_JSON,
    ]
    submit_btn.click(show_to_UI, inputs=[pdf_input], outputs=result_components)

demo.launch(debug=True, share=True)