pradeepsengarr committed on
Commit
e428e3e
·
verified ·
1 Parent(s): 920b3d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -417,7 +417,6 @@
417
  # else:
418
  # st.sidebar.info("Upload PDFs to begin your QA journey.")
419
 
420
-
421
  import os
422
  import streamlit as st
423
  import fitz # PyMuPDF
@@ -449,13 +448,14 @@ def load_model():
449
  pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
450
  return HuggingFacePipeline(pipeline=pipe)
451
 
452
- # --- Extract PDF Text ---
453
  def read_pdf(file):
454
  try:
455
  doc = fitz.open(stream=file.read(), filetype="pdf")
456
  text = ""
 
457
  for page in doc:
458
- text += page.get_text()
459
  return text.strip()
460
  except Exception as e:
461
  logging.error(f"Failed to extract text: {e}")
@@ -558,3 +558,4 @@ if uploaded_file:
558
  st.error("⚠️ No text could be extracted from the PDF. Try another file.")
559
  else:
560
  st.info("Upload a PDF to begin.")
 
 
417
  # else:
418
  # st.sidebar.info("Upload PDFs to begin your QA journey.")
419
 
 
420
  import os
421
  import streamlit as st
422
  import fitz # PyMuPDF
 
448
  pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
449
  return HuggingFacePipeline(pipeline=pipe)
450
 
451
+ # --- Extract PDF Text (Improved) ---
452
  def read_pdf(file):
453
  try:
454
  doc = fitz.open(stream=file.read(), filetype="pdf")
455
  text = ""
456
+ # Extract text from each page
457
  for page in doc:
458
+ text += page.get_text("text") # You can also use "dict" for structured text or "html"
459
  return text.strip()
460
  except Exception as e:
461
  logging.error(f"Failed to extract text: {e}")
 
558
  st.error("⚠️ No text could be extracted from the PDF. Try another file.")
559
  else:
560
  st.info("Upload a PDF to begin.")
561
+