Vishwas1 committed on
Commit
0d509f3
·
verified ·
1 Parent(s): b948fd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -51
app.py CHANGED
@@ -1,28 +1,18 @@
1
- import os
2
- from popplerqt5 import Poppler
3
- from PyQt5.QtCore import QByteArray
4
- import pytesseract
5
  from PIL import Image
6
-
7
- # Ensure pytesseract is configured for Marathi language
8
- pytesseract.pytesseract.tesseract_cmd = r"/path/to/tesseract" # Update if needed
9
- marathi_lang = "mar" # Ensure Marathi language is installed in Tesseract
10
 
11
  def extract_images_from_pdf(pdf_path):
12
  """
13
- Extract images from the PDF file using python-poppler-qt5.
14
  """
15
- document = Poppler.Document.load(pdf_path)
16
- if not document:
17
- raise ValueError(f"Unable to open {pdf_path}")
18
-
19
  images = []
20
- for i in range(document.numPages()):
21
- page = document.page(i)
22
- if page:
23
- image = page.renderToImage(300, 300) # DPI: 300x300 for better OCR
24
- images.append(image)
25
-
26
  return images
27
 
28
  def perform_ocr_on_images(images):
@@ -30,44 +20,20 @@ def perform_ocr_on_images(images):
30
  Perform OCR on the extracted images.
31
  """
32
  ocr_results = []
33
- for i, image in enumerate(images):
34
- # Convert Qt Image to PIL Image
35
- pil_image = Image.fromqimage(image)
36
- text = pytesseract.image_to_string(pil_image, lang=marathi_lang)
37
  ocr_results.append(text)
38
- print(f"OCR for Page {i + 1}: {text}")
39
-
40
- return ocr_results
41
 
42
  def ocr_marathi_from_pdf(pdf_path):
43
  """
44
  Main function to handle Marathi OCR from a PDF.
45
  """
46
- # Step 1: Extract images from the PDF
47
- print("Extracting images from PDF...")
48
  images = extract_images_from_pdf(pdf_path)
49
-
50
- # Step 2: Perform OCR on the extracted images
51
- print("Performing OCR on images...")
52
- ocr_results = perform_ocr_on_images(images)
53
-
54
- # Combine results
55
- combined_text = "\n".join(ocr_results)
56
- print(f"Combined OCR Text: {combined_text}")
57
-
58
- return combined_text
59
 
60
  if __name__ == "__main__":
61
- pdf_path = "path/to/marathi/pdf.pdf" # Replace with the path to your PDF
62
- if not os.path.exists(pdf_path):
63
- print(f"PDF file not found: {pdf_path}")
64
- else:
65
- print("Processing Marathi PDF...")
66
- ocr_text = ocr_marathi_from_pdf(pdf_path)
67
- with open("output.txt", "w", encoding="utf-8") as f:
68
- f.write(ocr_text)
69
- print("OCR text saved to output.txt")
70
-
71
-
72
-
73
-
 
1
+ import fitz # PyMuPDF
 
 
 
2
  from PIL import Image
3
+ import pytesseract
 
 
 
4
 
5
  def extract_images_from_pdf(pdf_path):
6
  """
7
+ Extract images from the PDF file using PyMuPDF.
8
  """
 
 
 
 
9
  images = []
10
+ document = fitz.open(pdf_path)
11
+ for page_number in range(len(document)):
12
+ page = document.load_page(page_number)
13
+ pix = page.get_pixmap(dpi=300) # Render page to an image with 300 DPI
14
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
15
+ images.append(img)
16
  return images
17
 
18
  def perform_ocr_on_images(images):
 
20
  Perform OCR on the extracted images.
21
  """
22
  ocr_results = []
23
+ for img in images:
24
+ text = pytesseract.image_to_string(img, lang='mar') # Specify 'mar' for Marathi
 
 
25
  ocr_results.append(text)
26
+ return "\n".join(ocr_results)
 
 
27
 
28
  def ocr_marathi_from_pdf(pdf_path):
29
  """
30
  Main function to handle Marathi OCR from a PDF.
31
  """
 
 
32
  images = extract_images_from_pdf(pdf_path)
33
+ ocr_text = perform_ocr_on_images(images)
34
+ return ocr_text
 
 
 
 
 
 
 
 
35
 
36
  if __name__ == "__main__":
37
+ pdf_path = "path/to/your/marathi.pdf" # Replace with your PDF file path
38
+ ocr_text = ocr_marathi_from_pdf(pdf_path)
39
+ print(ocr_text)