Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,14 @@
|
|
1 |
-
import fitz # PyMuPDF
|
2 |
from PIL import Image
|
3 |
import pytesseract
|
|
|
|
|
|
|
|
|
4 |
|
5 |
def extract_images_from_pdf(pdf_path):
|
6 |
"""
|
7 |
-
Extract images from
|
8 |
"""
|
9 |
images = []
|
10 |
document = fitz.open(pdf_path)
|
@@ -17,7 +21,7 @@ def extract_images_from_pdf(pdf_path):
|
|
17 |
|
18 |
def perform_ocr_on_images(images):
|
19 |
"""
|
20 |
-
Perform OCR on the extracted images.
|
21 |
"""
|
22 |
ocr_results = []
|
23 |
for img in images:
|
@@ -25,15 +29,23 @@ def perform_ocr_on_images(images):
|
|
25 |
ocr_results.append(text)
|
26 |
return "\n".join(ocr_results)
|
27 |
|
28 |
-
def ocr_marathi_from_pdf(
|
29 |
"""
|
30 |
-
Main function to handle Marathi OCR from a PDF.
|
31 |
"""
|
32 |
-
images = extract_images_from_pdf(
|
33 |
ocr_text = perform_ocr_on_images(images)
|
34 |
return ocr_text
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
if __name__ == "__main__":
|
37 |
-
|
38 |
-
|
39 |
-
print(ocr_text)
|
|
|
1 |
+
import fitz # PyMuPDF for PDF processing
|
2 |
from PIL import Image
|
3 |
import pytesseract
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
# Ensure Tesseract is configured with Marathi language support
|
7 |
+
# Install Marathi language: sudo apt-get install tesseract-ocr-mar
|
8 |
|
9 |
def extract_images_from_pdf(pdf_path):
|
10 |
"""
|
11 |
+
Extract images from a PDF file using PyMuPDF.
|
12 |
"""
|
13 |
images = []
|
14 |
document = fitz.open(pdf_path)
|
|
|
21 |
|
22 |
def perform_ocr_on_images(images):
|
23 |
"""
|
24 |
+
Perform OCR on the extracted images using pytesseract for Marathi text.
|
25 |
"""
|
26 |
ocr_results = []
|
27 |
for img in images:
|
|
|
29 |
ocr_results.append(text)
|
30 |
return "\n".join(ocr_results)
|
31 |
|
32 |
+
def ocr_marathi_from_pdf(pdf_file):
    """
    Main function to handle Marathi OCR from a PDF file.

    Args:
        pdf_file: The uploaded PDF. Either an object whose ``.name``
            attribute holds the file path (the shape the original code
            relied on), a plain path string, or ``None`` when nothing was
            uploaded.

    Returns:
        The text produced by ``perform_ocr_on_images`` for the images
        extracted from the PDF, or an empty string when ``pdf_file`` is
        ``None``.
    """
    # Nothing uploaded: return empty text instead of failing on `.name`.
    if pdf_file is None:
        return ""

    # A plain string is already the path; otherwise read the path from the
    # upload object's `.name` attribute, as before.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    images = extract_images_from_pdf(pdf_path)
    ocr_text = perform_ocr_on_images(images)
    return ocr_text
|
39 |
|
40 |
+
# Gradio UI wiring: a single PDF upload feeds ocr_marathi_from_pdf and the
# returned text is shown in a text box.
# NOTE(review): type="file" is the Gradio 3.x spelling; newer Gradio releases
# appear to accept only "filepath" or "binary" - confirm against the pinned
# Gradio version before changing it.
_pdf_upload = gr.File(type="file", label="Upload Marathi PDF")
_text_output = gr.Textbox(label="Extracted Marathi Text")

interface = gr.Interface(
    fn=ocr_marathi_from_pdf,
    inputs=_pdf_upload,
    outputs=_text_output,
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
)

if __name__ == "__main__":
    # Launch the web app only when executed as a script, not on import.
    interface.launch()
|
51 |
+
|
|