Gary0205 committed on
Commit
28bc080
·
1 Parent(s): 0cb5651

Add OCR extraction app using TrOCR

Browse files
Files changed (1) hide show
  1. app.py +45 -0
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
3
+ from pdf2image import convert_from_path
4
+ import pytesseract
5
+
6
# Load the TrOCR processor and model from Hugging Face. Both are created from
# the same checkpoint, so the identifier is spelled once and reused.
_TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
9
+
10
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF by running OCR on its rendered images.

    The PDF is converted to images with ``convert_from_path``. Each image is
    passed through the module-level TrOCR ``processor`` and ``model``; when
    the decoded TrOCR output is empty or whitespace-only, the text for that
    image is taken from ``pytesseract.image_to_string`` instead.

    Args:
        pdf_path: Path of the PDF file, forwarded to ``convert_from_path``.

    Returns:
        The per-image texts joined with newline characters.
    """
    # NOTE(review): the handwritten TrOCR checkpoint is presumably meant for
    # single text-line crops; confirm whole-page input gives usable output.
    page_texts = []

    for page in convert_from_path(pdf_path):
        # Preprocess the image into model inputs, generate token ids, then
        # decode the first (only) sequence in the batch.
        encoded = processor(images=page, return_tensors="pt")
        token_ids = model.generate(encoded.pixel_values)
        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
        page_text = decoded[0]

        # Fall back to Tesseract if TrOCR produced nothing but whitespace.
        if page_text.strip() == "":
            page_text = pytesseract.image_to_string(page)

        page_texts.append(page_text)

    return "\n".join(page_texts)
28
+
29
+ # Gradio Interface
30
def ocr_pipeline(pdf_file):
    """Gradio callback: run OCR on an uploaded PDF and return its text.

    Args:
        pdf_file: The value Gradio supplies for the file input. Depending on
            the Gradio version and component configuration this is either a
            filesystem path string or a file-like object that exposes the
            path as ``.name``. It is ``None`` when nothing was uploaded.

    Returns:
        The text produced by ``extract_text_from_pdf``, or an empty string
        when no file was provided.
    """
    # Submitting without a file yields None; return early instead of failing
    # with AttributeError on ``None.name``.
    if pdf_file is None:
        return ""

    # Accept both a plain path string and a tempfile-style wrapper object.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    return extract_text_from_pdf(pdf_path)
34
+
35
# Gradio UI: a single file upload wired to ocr_pipeline, with text output.
iface = gr.Interface(
    fn=ocr_pipeline,
    # Restrict the upload picker to PDFs: the callback hands the file
    # straight to PDF extraction and cannot process other formats.
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs="text",
    title="PDF Text Extraction using TrOCR",
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
45
+