Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -101,12 +101,11 @@
|
|
101 |
# if __name__ == "__main__":
|
102 |
# main()
|
103 |
|
104 |
-
|
105 |
import os
|
106 |
-
import PyPDF2
|
107 |
import logging
|
108 |
import math
|
109 |
import streamlit as st
|
|
|
110 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
111 |
from langchain_community.document_loaders import PDFMinerLoader
|
112 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
@@ -135,14 +134,14 @@ base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
|
|
135 |
# Helper Functions
|
136 |
|
137 |
def extract_text_from_pdf(file_path):
|
138 |
-
"""Extract text from a PDF using
|
139 |
try:
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
except Exception as e:
|
147 |
logging.error(f"Error reading PDF {file_path}: {e}")
|
148 |
return None
|
@@ -161,7 +160,7 @@ def data_ingestion():
|
|
161 |
file_path = os.path.join(uploaded_files_dir, filename)
|
162 |
logging.info(f"Processing file: {file_path}")
|
163 |
|
164 |
-
# Extract text using PyPDF2
|
165 |
text = extract_text_from_pdf(file_path)
|
166 |
|
167 |
if text:
|
|
|
101 |
# if __name__ == "__main__":
|
102 |
# main()
|
103 |
|
|
|
104 |
import os
|
|
|
105 |
import logging
|
106 |
import math
|
107 |
import streamlit as st
|
108 |
+
import fitz # PyMuPDF
|
109 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
110 |
from langchain_community.document_loaders import PDFMinerLoader
|
111 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
134 |
# Helper Functions
|
135 |
|
136 |
def extract_text_from_pdf(file_path):
    """Extract text from a PDF using PyMuPDF (fitz).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The plain text of every page concatenated in page order, or
        ``None`` if the document could not be opened or read. In the
        failure case the error is logged via ``logging.error``.
    """
    try:
        # Use the document as a context manager so it is closed even when
        # a page fails to parse; the previous version never closed it.
        with fitz.open(file_path) as doc:
            # Iterate pages directly and join once instead of growing a
            # string with ``+=`` on every page.
            return "".join(page.get_text("text") for page in doc)
    except Exception as e:
        # Deliberately broad: any failure is logged and reported as None
        # rather than raised to the caller.
        logging.error(f"Error reading PDF {file_path}: {e}")
        return None
|
|
|
160 |
file_path = os.path.join(uploaded_files_dir, filename)
|
161 |
logging.info(f"Processing file: {file_path}")
|
162 |
|
163 |
+
# Extract text using PyMuPDF
|
164 |
text = extract_text_from_pdf(file_path)
|
165 |
|
166 |
if text:
|