Spaces:

Vishwas1
/

PDF2Marathi

Sleeping

Vishwas1 committed on Dec 7, 2024

Commit

5c27db2

verified ·

1 Parent(s): 0d509f3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,10 +1,14 @@
-import fitz  # PyMuPDF
 from PIL import Image
 import pytesseract
 def extract_images_from_pdf(pdf_path):
     """
-    Extract images from the PDF file using PyMuPDF.
     """
     images = []
     document = fitz.open(pdf_path)
@@ -17,7 +21,7 @@ def extract_images_from_pdf(pdf_path):
 def perform_ocr_on_images(images):
     """
-    Perform OCR on the extracted images.
     """
     ocr_results = []
     for img in images:
@@ -25,15 +29,23 @@ def perform_ocr_on_images(images):
         ocr_results.append(text)
     return "\n".join(ocr_results)
-def ocr_marathi_from_pdf(pdf_path):
     """
-    Main function to handle Marathi OCR from a PDF.
     """
-    images = extract_images_from_pdf(pdf_path)
     ocr_text = perform_ocr_on_images(images)
     return ocr_text
 if __name__ == "__main__":
-    pdf_path = "path/to/your/marathi.pdf"  # Replace with your PDF file path
-    ocr_text = ocr_marathi_from_pdf(pdf_path)
-    print(ocr_text)

+import fitz  # PyMuPDF for PDF processing
 from PIL import Image
 import pytesseract
+import gradio as gr
+# Ensure Tesseract is configured with Marathi language support
+# Install the Marathi language data: sudo apt-get install tesseract-ocr-mar
 def extract_images_from_pdf(pdf_path):
     """
+    Extract images from a PDF file using PyMuPDF.
     """
     images = []
     document = fitz.open(pdf_path)
 def perform_ocr_on_images(images):
     """
+    Perform OCR on the extracted images using pytesseract for Marathi text.
     """
     ocr_results = []
     for img in images:
         ocr_results.append(text)
     return "\n".join(ocr_results)
+def ocr_marathi_from_pdf(pdf_file):
     """
+    Main function to handle Marathi OCR from a PDF file.
     """
+    images = extract_images_from_pdf(pdf_file.name)  # Use the file path from the upload
     ocr_text = perform_ocr_on_images(images)
     return ocr_text
+# Define the Gradio interface
+interface = gr.Interface(
+    fn=ocr_marathi_from_pdf,
+    inputs=gr.File(type="file", label="Upload Marathi PDF"),
+    outputs=gr.Textbox(label="Extracted Marathi Text"),
+    title="Marathi PDF OCR",
+    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
+)
 if __name__ == "__main__":
+    interface.launch()