pradeepsengarr committed on
Commit
6ccf2cb
·
verified ·
1 Parent(s): 3875c87

Update app.py: replace PyPDF2 with PyMuPDF (fitz) for PDF text extraction

Browse files
Files changed (1) hide show
  1. app.py +9 -10
app.py CHANGED
@@ -101,12 +101,11 @@
101
  # if __name__ == "__main__":
102
  # main()
103
 
104
-
105
  import os
106
- import PyPDF2
107
  import logging
108
  import math
109
  import streamlit as st
 
110
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
111
  from langchain_community.document_loaders import PDFMinerLoader
112
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -135,14 +134,14 @@ base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
135
  # Helper Functions
136
 
137
  def extract_text_from_pdf(file_path):
138
- """Extract text from a PDF using PyPDF2."""
139
  try:
140
- with open(file_path, 'rb') as file:
141
- reader = PyPDF2.PdfReader(file)
142
- text = ""
143
- for page in range(len(reader.pages)):
144
- text += reader.pages[page].extract_text()
145
- return text
146
  except Exception as e:
147
  logging.error(f"Error reading PDF {file_path}: {e}")
148
  return None
@@ -161,7 +160,7 @@ def data_ingestion():
161
  file_path = os.path.join(uploaded_files_dir, filename)
162
  logging.info(f"Processing file: {file_path}")
163
 
164
- # Extract text using PyPDF2
165
  text = extract_text_from_pdf(file_path)
166
 
167
  if text:
 
101
  # if __name__ == "__main__":
102
  # main()
103
 
 
104
  import os
 
105
  import logging
106
  import math
107
  import streamlit as st
108
+ import fitz # PyMuPDF
109
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
110
  from langchain_community.document_loaders import PDFMinerLoader
111
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
134
  # Helper Functions
135
 
136
  def extract_text_from_pdf(file_path):
137
+ """Extract text from a PDF using PyMuPDF (fitz)."""
138
  try:
139
+ doc = fitz.open(file_path)
140
+ text = ""
141
+ for page_num in range(doc.page_count):
142
+ page = doc.load_page(page_num)
143
+ text += page.get_text("text")
144
+ return text
145
  except Exception as e:
146
  logging.error(f"Error reading PDF {file_path}: {e}")
147
  return None
 
160
  file_path = os.path.join(uploaded_files_dir, filename)
161
  logging.info(f"Processing file: {file_path}")
162
 
163
+ # Extract text using PyMuPDF
164
  text = extract_text_from_pdf(file_path)
165
 
166
  if text: