Vishwas1 committed on
Commit
5c27db2
·
verified ·
1 Parent(s): 0d509f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -9
app.py CHANGED
@@ -1,10 +1,14 @@
1
- import fitz # PyMuPDF
2
  from PIL import Image
3
  import pytesseract
 
 
 
 
4
 
5
  def extract_images_from_pdf(pdf_path):
6
  """
7
- Extract images from the PDF file using PyMuPDF.
8
  """
9
  images = []
10
  document = fitz.open(pdf_path)
@@ -17,7 +21,7 @@ def extract_images_from_pdf(pdf_path):
17
 
18
  def perform_ocr_on_images(images):
19
  """
20
- Perform OCR on the extracted images.
21
  """
22
  ocr_results = []
23
  for img in images:
@@ -25,15 +29,23 @@ def perform_ocr_on_images(images):
25
  ocr_results.append(text)
26
  return "\n".join(ocr_results)
27
 
28
- def ocr_marathi_from_pdf(pdf_path):
29
  """
30
- Main function to handle Marathi OCR from a PDF.
31
  """
32
- images = extract_images_from_pdf(pdf_path)
33
  ocr_text = perform_ocr_on_images(images)
34
  return ocr_text
35
 
 
 
 
 
 
 
 
 
 
36
  if __name__ == "__main__":
37
- pdf_path = "path/to/your/marathi.pdf" # Replace with your PDF file path
38
- ocr_text = ocr_marathi_from_pdf(pdf_path)
39
- print(ocr_text)
 
1
+ import fitz # PyMuPDF for PDF processing
2
  from PIL import Image
3
  import pytesseract
4
+ import gradio as gr
5
+
6
+ # Ensure Tesseract is configured with Marathi language support
7
+ # Install Marathi language: sudo apt-get install tesseract-ocr-mar
8
 
9
  def extract_images_from_pdf(pdf_path):
10
  """
11
+ Extract images from a PDF file using PyMuPDF.
12
  """
13
  images = []
14
  document = fitz.open(pdf_path)
 
21
 
22
  def perform_ocr_on_images(images):
23
  """
24
+ Perform OCR on the extracted images using pytesseract for Marathi text.
25
  """
26
  ocr_results = []
27
  for img in images:
 
29
  ocr_results.append(text)
30
  return "\n".join(ocr_results)
31
 
32
+ def ocr_marathi_from_pdf(pdf_file):
33
  """
34
+ Main function to handle Marathi OCR from a PDF file.
35
  """
36
+ images = extract_images_from_pdf(pdf_file.name) # Use the file path from the upload
37
  ocr_text = perform_ocr_on_images(images)
38
  return ocr_text
39
 
40
+ # Define the Gradio interface
41
+ interface = gr.Interface(
42
+ fn=ocr_marathi_from_pdf,
43
+ inputs=gr.File(type="file", label="Upload Marathi PDF"),
44
+ outputs=gr.Textbox(label="Extracted Marathi Text"),
45
+ title="Marathi PDF OCR",
46
+ description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
47
+ )
48
+
49
  if __name__ == "__main__":
50
+ interface.launch()
51
+