pradeepsengarr committed on
Commit
6956d92
·
verified ·
1 Parent(s): 7c797e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -16
app.py CHANGED
@@ -215,7 +215,8 @@ import math
215
  import streamlit as st
216
  import fitz # PyMuPDF
217
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
218
- from langchain_community.document_loaders import PDFMinerLoader
 
219
  from langchain.text_splitter import RecursiveCharacterTextSplitter
220
  from langchain_community.embeddings import SentenceTransformerEmbeddings
221
  from langchain_community.vectorstores import Chroma
@@ -253,46 +254,81 @@ def extract_outline_from_pdf(path):
253
  return f"Could not preview PDF: {e}"
254
 
255
  def data_ingestion():
 
256
  try:
257
  logging.info("Starting data ingestion")
 
258
  if not os.path.exists(uploaded_files_dir):
259
  os.makedirs(uploaded_files_dir)
260
 
261
- documents = []
262
  for filename in os.listdir(uploaded_files_dir):
263
  if filename.endswith(".pdf"):
264
- path = os.path.join(uploaded_files_dir, filename)
265
- logging.info(f"Loading: {path}")
 
266
  try:
267
- loader = PDFMinerLoader(path)
268
  loaded_docs = loader.load()
 
 
 
 
 
 
269
  for doc in loaded_docs:
270
- if hasattr(doc, 'page_content'):
271
  documents.append(doc)
 
 
 
272
  except Exception as e:
273
- logging.warning(f"Skipping {path}: {str(e)}")
 
274
 
275
  if not documents:
276
- st.error("⚠️ No valid documents found. Check the PDF content.")
277
  return
278
 
 
 
 
279
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
280
  texts = text_splitter.split_documents(documents)
281
 
 
 
 
 
 
 
282
  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
 
 
 
 
 
 
283
  db = None
284
- MAX_BATCH_SIZE = 5461
285
- for i in range(0, len(texts), MAX_BATCH_SIZE):
286
- batch = texts[i:i + MAX_BATCH_SIZE]
 
 
 
 
287
  if db is None:
288
- db = Chroma.from_documents(batch, embeddings, persist_directory=persist_directory)
289
  else:
290
- db.add_documents(batch)
 
291
  db.persist()
292
- logging.info("Data ingestion completed.")
 
293
  except Exception as e:
294
- logging.error(f"Ingestion error: {e}")
295
- st.error(f"Ingestion failed: {e}")
 
296
 
297
  def llm_pipeline():
298
  pipe = pipeline(
 
215
  import streamlit as st
216
  import fitz # PyMuPDF
217
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
218
+ # from langchain_community.document_loaders import PDFMinerLoader
219
+ from langchain_community.document_loaders import PyMuPDFLoader
220
  from langchain.text_splitter import RecursiveCharacterTextSplitter
221
  from langchain_community.embeddings import SentenceTransformerEmbeddings
222
  from langchain_community.vectorstores import Chroma
 
254
  return f"Could not preview PDF: {e}"
255
 
256
def data_ingestion():
    """Load PDFs from the upload directory, validate their content, split them
    into chunks, and persist embeddings to the Chroma vector store.

    Reads module-level globals ``uploaded_files_dir`` and ``persist_directory``.
    Returns None; re-raises on unrecoverable failure so the caller can surface it.
    """
    try:
        logging.info("Starting data ingestion")

        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(uploaded_files_dir, exist_ok=True)

        documents = _load_pdf_documents(uploaded_files_dir)
        if not documents:
            logging.error("No valid documents found to process.")
            return

        logging.info(f"Total valid documents: {len(documents)}")

        # Split into overlapping chunks so retrieval can return focused passages.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        texts = text_splitter.split_documents(documents)
        logging.info(f"Total text chunks created: {len(texts)}")

        if not texts:
            logging.error("No valid text chunks to create embeddings.")
            return

        _embed_and_persist(texts)
        logging.info("Data ingestion completed successfully")

    except Exception as e:
        logging.error(f"Error during data ingestion: {str(e)}")
        raise


def _load_pdf_documents(source_dir):
    """Load every readable PDF under *source_dir*, skipping broken or text-free files.

    Returns a list of loaded document objects with non-empty page content.
    """
    documents = []
    for filename in os.listdir(source_dir):
        # Case-insensitive match so files like "REPORT.PDF" are not silently ignored.
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(source_dir, filename)
        logging.info(f"Processing file: {file_path}")
        try:
            loaded_docs = PyMuPDFLoader(file_path).load()
        except Exception as e:
            logging.error(f"Skipping {file_path}: {str(e)}")
            continue

        # Keep every page that carries real text. Checking all pages (not just
        # the first) keeps PDFs whose opening page is an image-only cover.
        readable = [
            doc for doc in loaded_docs
            if hasattr(doc, "page_content") and doc.page_content.strip()
        ]
        if not readable:
            logging.warning(
                f"No readable text found in {file_path}. "
                "Might be a scanned image or unsupported format."
            )
            continue
        documents.extend(readable)
    return documents


def _embed_and_persist(texts):
    """Embed *texts* in bounded batches and persist them to the Chroma store."""
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Chroma rejects over-large single insertions, so feed it in bounded batches.
    MAX_BATCH_SIZE = 5461
    total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
    logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")

    db = None
    for i in range(total_batches):
        text_batch = texts[i * MAX_BATCH_SIZE:(i + 1) * MAX_BATCH_SIZE]
        logging.info(f"Processing batch {i + 1}/{total_batches}, size: {len(text_batch)}")
        if db is None:
            # First batch creates the store; later batches append to it.
            db = Chroma.from_documents(text_batch, embeddings, persist_directory=persist_directory)
        else:
            db.add_documents(text_batch)
    db.persist()
332
 
333
  def llm_pipeline():
334
  pipe = pipeline(