Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -133,6 +133,7 @@ from starlette.responses import RedirectResponse
|
|
133 |
from tika import parser
|
134 |
from openpyxl import load_workbook
|
135 |
import os
|
|
|
136 |
# Initialize Tika for DOCX & PPTX parsing
|
137 |
tika.initVM()
|
138 |
|
@@ -159,10 +160,11 @@ def validate_file_type(file):
|
|
159 |
return None
|
160 |
return "β Invalid file format!"
|
161 |
|
|
|
162 |
# ✅ Extract Text from PDF
|
163 |
def extract_text_from_pdf(file_bytes):
|
164 |
try:
|
165 |
-
doc =
|
166 |
return "\n".join([page.get_text() for page in doc])
|
167 |
except Exception as e:
|
168 |
print(f"β PDF Extraction Error: {e}") # Log error
|
|
|
133 |
from tika import parser
|
134 |
from openpyxl import load_workbook
|
135 |
import os
|
136 |
+
import pymupdf
|
137 |
# Initialize Tika for DOCX & PPTX parsing
|
138 |
tika.initVM()
|
139 |
|
|
|
160 |
return None
|
161 |
return "β Invalid file format!"
|
162 |
|
163 |
+
# ✅ Extract Text from PDF
|
164 |
# ✅ Extract Text from PDF
|
165 |
def extract_text_from_pdf(file_bytes):
|
166 |
try:
|
167 |
+
doc = pymupdf.open(stream=file_bytes, filetype="pdf") # Use pymupdf.open()
|
168 |
return "\n".join([page.get_text() for page in doc])
|
169 |
except Exception as e:
|
170 |
print(f"β PDF Extraction Error: {e}") # Log error
|