mocktestgen committed on
Commit
cf6fac4
·
verified ·
1 Parent(s): facb671

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -18
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import pdfplumber
3
  from PIL import Image
4
- import pytesseract
5
  import io
6
  import re
7
  import random
@@ -23,28 +22,14 @@ def extract_text_from_pdf(file_bytes):
23
  page_text = page.extract_text()
24
  if page_text:
25
  text += page_text + "\n"
26
- # If extracted text is empty, fallback to OCR per page
27
- if not text.strip():
28
- text = ocr_pdf(file_bytes)
29
  return text
30
  except Exception as e:
31
  return ""
32
 
33
- def ocr_pdf(file_bytes):
34
- text = ""
35
- with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
36
- for page in pdf.pages:
37
- # Convert page to image
38
- pil_image = page.to_image(resolution=300).original
39
- # OCR
40
- page_text = pytesseract.image_to_string(pil_image)
41
- text += page_text + "\n"
42
- return text
43
-
44
  def extract_text_from_image(file_bytes):
45
- image = Image.open(io.BytesIO(file_bytes))
46
- text = pytesseract.image_to_string(image)
47
- return text
48
 
49
  def extract_text_from_txt(file_bytes):
50
  try:
@@ -216,6 +201,7 @@ def main_process(file, question_type, num_questions):
216
  if fname.endswith(".pdf"):
217
  extracted_text = extract_text_from_pdf(file_bytes)
218
  elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
 
219
  extracted_text = extract_text_from_image(file_bytes)
220
  elif fname.endswith(".txt"):
221
  extracted_text = extract_text_from_txt(file_bytes)
@@ -285,3 +271,4 @@ with gr.Blocks(css="""
285
 
286
  if __name__ == "__main__":
287
  demo.launch()
 
 
1
  import gradio as gr
2
  import pdfplumber
3
  from PIL import Image
 
4
  import io
5
  import re
6
  import random
 
22
  page_text = page.extract_text()
23
  if page_text:
24
  text += page_text + "\n"
25
+ # Do not fallback on OCR because pytesseract requires system installation
 
 
26
  return text
27
  except Exception as e:
28
  return ""
29
 
 
 
 
 
 
 
 
 
 
 
 
30
  def extract_text_from_image(file_bytes):
31
+ # OCR disabled due to system dependencies on Tesseract
32
+ return "OCR not supported in this environment. Please upload a PDF or TXT file containing selectable text."
 
33
 
34
  def extract_text_from_txt(file_bytes):
35
  try:
 
201
  if fname.endswith(".pdf"):
202
  extracted_text = extract_text_from_pdf(file_bytes)
203
  elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
204
+ # OCR unsupported fallback message
205
  extracted_text = extract_text_from_image(file_bytes)
206
  elif fname.endswith(".txt"):
207
  extracted_text = extract_text_from_txt(file_bytes)
 
271
 
272
  if __name__ == "__main__":
273
  demo.launch()
274
+