Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import pdfplumber
|
3 |
from PIL import Image
|
4 |
-
import pytesseract
|
5 |
import io
|
6 |
import re
|
7 |
import random
|
@@ -23,28 +22,14 @@ def extract_text_from_pdf(file_bytes):
|
|
23 |
page_text = page.extract_text()
|
24 |
if page_text:
|
25 |
text += page_text + "\n"
|
26 |
-
#
|
27 |
-
if not text.strip():
|
28 |
-
text = ocr_pdf(file_bytes)
|
29 |
return text
|
30 |
except Exception as e:
|
31 |
return ""
|
32 |
|
33 |
-
def ocr_pdf(file_bytes):
|
34 |
-
text = ""
|
35 |
-
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
36 |
-
for page in pdf.pages:
|
37 |
-
# Convert page to image
|
38 |
-
pil_image = page.to_image(resolution=300).original
|
39 |
-
# OCR
|
40 |
-
page_text = pytesseract.image_to_string(pil_image)
|
41 |
-
text += page_text + "\n"
|
42 |
-
return text
|
43 |
-
|
44 |
def extract_text_from_image(file_bytes):
|
45 |
-
|
46 |
-
|
47 |
-
return text
|
48 |
|
49 |
def extract_text_from_txt(file_bytes):
|
50 |
try:
|
@@ -216,6 +201,7 @@ def main_process(file, question_type, num_questions):
|
|
216 |
if fname.endswith(".pdf"):
|
217 |
extracted_text = extract_text_from_pdf(file_bytes)
|
218 |
elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
|
|
|
219 |
extracted_text = extract_text_from_image(file_bytes)
|
220 |
elif fname.endswith(".txt"):
|
221 |
extracted_text = extract_text_from_txt(file_bytes)
|
@@ -285,3 +271,4 @@ with gr.Blocks(css="""
|
|
285 |
|
286 |
if __name__ == "__main__":
|
287 |
demo.launch()
|
|
|
|
1 |
import gradio as gr
|
2 |
import pdfplumber
|
3 |
from PIL import Image
|
|
|
4 |
import io
|
5 |
import re
|
6 |
import random
|
|
|
22 |
page_text = page.extract_text()
|
23 |
if page_text:
|
24 |
text += page_text + "\n"
|
25 |
+
# Do not fall back to OCR: pytesseract requires a system-level Tesseract installation
|
|
|
|
|
26 |
return text
|
27 |
except Exception as e:
|
28 |
return ""
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
def extract_text_from_image(file_bytes):
|
31 |
+
# OCR is disabled because it depends on a system-level Tesseract installation
|
32 |
+
return "OCR not supported in this environment. Please upload a PDF or TXT file containing selectable text."
|
|
|
33 |
|
34 |
def extract_text_from_txt(file_bytes):
|
35 |
try:
|
|
|
201 |
if fname.endswith(".pdf"):
|
202 |
extracted_text = extract_text_from_pdf(file_bytes)
|
203 |
elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
|
204 |
+
# OCR is unsupported here; extract_text_from_image returns a fallback message
|
205 |
extracted_text = extract_text_from_image(file_bytes)
|
206 |
elif fname.endswith(".txt"):
|
207 |
extracted_text = extract_text_from_txt(file_bytes)
|
|
|
271 |
|
272 |
if __name__ == "__main__":
|
273 |
demo.launch()
|
274 |
+
|