Update app.py
app.py CHANGED
```diff
@@ -215,7 +215,8 @@ import math
 import streamlit as st
 import fitz  # PyMuPDF
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-from langchain_community.document_loaders import PDFMinerLoader
+# from langchain_community.document_loaders import PDFMinerLoader
+from langchain_community.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.vectorstores import Chroma
```
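The loader swap keeps LangChain's common document-loader interface, so the rest of `data_ingestion()` is untouched: both loaders expose `.load()` and return `Document` objects carrying `page_content` and `metadata`. A minimal sketch of the new loader in isolation, assuming a hypothetical sample path:

```python
from langchain_community.document_loaders import PyMuPDFLoader

# PyMuPDFLoader returns one Document per page; metadata includes
# the source path and the page number.
loader = PyMuPDFLoader("uploads/sample.pdf")  # hypothetical path
pages = loader.load()
for page in pages:
    print(page.metadata.get("page"), len(page.page_content))
```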
```diff
@@ -253,46 +254,81 @@ def extract_outline_from_pdf(path):
         return f"Could not preview PDF: {e}"
 
 def data_ingestion():
+    """Load PDFs, validate content, and generate embeddings."""
     try:
         logging.info("Starting data ingestion")
+
         if not os.path.exists(uploaded_files_dir):
             os.makedirs(uploaded_files_dir)
 
-        documents = []
+        documents = []
         for filename in os.listdir(uploaded_files_dir):
             if filename.endswith(".pdf"):
-                …
-                logging.info(f"…
+                file_path = os.path.join(uploaded_files_dir, filename)
+                logging.info(f"Processing file: {file_path}")
+
                 try:
-                    loader = PDFMinerLoader(…
+                    loader = PyMuPDFLoader(file_path)
                     loaded_docs = loader.load()
+
+                    # Check if any content exists in loaded_docs
+                    if not loaded_docs or len(loaded_docs[0].page_content.strip()) == 0:
+                        logging.warning(f"No readable text found in {file_path}. Might be a scanned image or unsupported format.")
+                        continue
+
                     for doc in loaded_docs:
-                        if hasattr(doc, 'page_content'):
+                        if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
                             documents.append(doc)
+                        else:
+                            logging.warning(f"Skipping invalid document structure in {file_path}")
+
                 except Exception as e:
-                    logging.…
+                    logging.error(f"Skipping {file_path}: {str(e)}")
+                    continue
 
         if not documents:
-            …
+            logging.error("No valid documents found to process.")
             return
 
+        logging.info(f"Total valid documents: {len(documents)}")
+
+        # Proceed with splitting and embedding documents
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
         texts = text_splitter.split_documents(documents)
 
+        logging.info(f"Total text chunks created: {len(texts)}")
+
+        if not texts:
+            logging.error("No valid text chunks to create embeddings.")
+            return
+
         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+        MAX_BATCH_SIZE = 5461
+        total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
+
+        logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
+
         db = None
-        …
-        …
-        …
+        for i in range(total_batches):
+            batch_start = i * MAX_BATCH_SIZE
+            batch_end = min((i + 1) * MAX_BATCH_SIZE, len(texts))
+            text_batch = texts[batch_start:batch_end]
+
+            logging.info(f"Processing batch {i + 1}/{total_batches}, size: {len(text_batch)}")
+
             if db is None:
-                db = Chroma.from_documents(…
+                db = Chroma.from_documents(text_batch, embeddings, persist_directory=persist_directory)
             else:
-                db.add_documents(…
+                db.add_documents(text_batch)
+
         db.persist()
-        logging.info("Data ingestion completed…
+        logging.info("Data ingestion completed successfully")
+
     except Exception as e:
-        logging.error(f"…
-        …
+        logging.error(f"Error during data ingestion: {str(e)}")
+        raise
+
 
 def llm_pipeline():
     pipe = pipeline(
```
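The "scanned image" warning above is a heuristic: an empty `page_content` usually means the PDF has no text layer. Since app.py already imports `fitz` (PyMuPDF), the file can be checked directly; this is a sketch under that assumption, with a hypothetical helper name and path:

```python
import fitz  # PyMuPDF

def has_text_layer(pdf_path: str) -> bool:
    """Return True if any page yields extractable text."""
    with fitz.open(pdf_path) as doc:
        return any(page.get_text().strip() for page in doc)

# A False result (e.g. for "uploads/scanned.pdf") suggests the file
# needs OCR before ingestion.
```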
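The new batching loop guards against Chroma's per-call insert limit: on common chromadb builds an oversized insert fails with an error like "Batch size ... exceeds maximum batch size 5461", which is presumably where `MAX_BATCH_SIZE = 5461` comes from. The same slicing logic as a standalone sketch (the `iter_batches` helper is hypothetical):

```python
import math

MAX_BATCH_SIZE = 5461  # matches Chroma's insert limit on common builds

def iter_batches(items, batch_size=MAX_BATCH_SIZE):
    """Yield successive slices of `items`, each at most `batch_size` long."""
    for i in range(math.ceil(len(items) / batch_size)):
        yield items[i * batch_size : (i + 1) * batch_size]

# Mirrors the loop in data_ingestion():
# db = None
# for batch in iter_batches(texts):
#     if db is None:
#         db = Chroma.from_documents(batch, embeddings, persist_directory=persist_directory)
#     else:
#         db.add_documents(batch)
```

One version note: chromadb 0.4+ persists automatically, and newer langchain-chroma releases drop `persist()` entirely, so the `db.persist()` call may need to be removed or guarded depending on the pinned versions.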