Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -417,7 +417,6 @@
|
|
417 |
# else:
|
418 |
# st.sidebar.info("Upload PDFs to begin your QA journey.")
|
419 |
|
420 |
-
|
421 |
import os
|
422 |
import streamlit as st
|
423 |
import fitz # PyMuPDF
|
@@ -449,13 +448,14 @@ def load_model():
|
|
449 |
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
|
450 |
return HuggingFacePipeline(pipeline=pipe)
|
451 |
|
452 |
-
# --- Extract PDF Text ---
|
453 |
def read_pdf(file):
|
454 |
try:
|
455 |
doc = fitz.open(stream=file.read(), filetype="pdf")
|
456 |
text = ""
|
|
|
457 |
for page in doc:
|
458 |
-
text += page.get_text()
|
459 |
return text.strip()
|
460 |
except Exception as e:
|
461 |
logging.error(f"Failed to extract text: {e}")
|
@@ -558,3 +558,4 @@ if uploaded_file:
|
|
558 |
st.error("⚠️ No text could be extracted from the PDF. Try another file.")
|
559 |
else:
|
560 |
st.info("Upload a PDF to begin.")
|
|
|
|
417 |
# else:
|
418 |
# st.sidebar.info("Upload PDFs to begin your QA journey.")
|
419 |
|
|
|
420 |
import os
|
421 |
import streamlit as st
|
422 |
import fitz # PyMuPDF
|
|
|
448 |
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
|
449 |
return HuggingFacePipeline(pipeline=pipe)
|
450 |
|
451 |
+
# --- Extract PDF Text (Improved) ---
|
452 |
def read_pdf(file):
|
453 |
try:
|
454 |
doc = fitz.open(stream=file.read(), filetype="pdf")
|
455 |
text = ""
|
456 |
+
# Extract text from each page
|
457 |
for page in doc:
|
458 |
+
text += page.get_text("text") # You can also use "dict" for structured text or "html"
|
459 |
return text.strip()
|
460 |
except Exception as e:
|
461 |
logging.error(f"Failed to extract text: {e}")
|
|
|
558 |
st.error("⚠️ No text could be extracted from the PDF. Try another file.")
|
559 |
else:
|
560 |
st.info("Upload a PDF to begin.")
|
561 |
+
|