Update app.py
app.py CHANGED
```diff
@@ -215,7 +215,8 @@ import math
 import streamlit as st
 import fitz  # PyMuPDF
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-from langchain_community.document_loaders import PDFMinerLoader
+# from langchain_community.document_loaders import PDFMinerLoader
+from langchain_community.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.vectorstores import Chroma
```
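The loader swap keeps LangChain's common document-loader interface, so the rest of `data_ingestion()` is untouched: both loaders expose `.load()` and return `Document` objects carrying `page_content` and `metadata`. A minimal sketch of the new loader in isolation, assuming a hypothetical sample path:

```python
from langchain_community.document_loaders import PyMuPDFLoader

# PyMuPDFLoader returns one Document per page; metadata includes
# the source path and the page number.
loader = PyMuPDFLoader("uploads/sample.pdf")  # hypothetical path
pages = loader.load()
for page in pages:
    print(page.metadata.get("page"), len(page.page_content))
```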
```diff
@@ -253,46 +254,81 @@ def extract_outline_from_pdf(path):
         return f"Could not preview PDF: {e}"
 
 def data_ingestion():
+    """Load PDFs, validate content, and generate embeddings."""
     try:
         logging.info("Starting data ingestion")
+
         if not os.path.exists(uploaded_files_dir):
             os.makedirs(uploaded_files_dir)
 
-        documents = []
+        documents = []
         for filename in os.listdir(uploaded_files_dir):
             if filename.endswith(".pdf"):
-                …
-                logging.info(f"…
+                file_path = os.path.join(uploaded_files_dir, filename)
+                logging.info(f"Processing file: {file_path}")
+
                 try:
-                    loader = PDFMinerLoader(…
+                    loader = PyMuPDFLoader(file_path)
                     loaded_docs = loader.load()
+
+                    # Check if any content exists in loaded_docs
+                    if not loaded_docs or len(loaded_docs[0].page_content.strip()) == 0:
+                        logging.warning(f"No readable text found in {file_path}. Might be a scanned image or unsupported format.")
+                        continue
+
                     for doc in loaded_docs:
-                        if hasattr(doc, 'page_content'):
+                        if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
                             documents.append(doc)
+                        else:
+                            logging.warning(f"Skipping invalid document structure in {file_path}")
+
                 except Exception as e:
-                    logging.…
+                    logging.error(f"Skipping {file_path}: {str(e)}")
+                    continue
 
         if not documents:
-            …
+            logging.error("No valid documents found to process.")
             return
 
+        logging.info(f"Total valid documents: {len(documents)}")
+
+        # Proceed with splitting and embedding documents
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
         texts = text_splitter.split_documents(documents)
 
+        logging.info(f"Total text chunks created: {len(texts)}")
+
+        if not texts:
+            logging.error("No valid text chunks to create embeddings.")
+            return
+
         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+        MAX_BATCH_SIZE = 5461
+        total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
+
+        logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
+
         db = None
-        …
-        …
-        …
+        for i in range(total_batches):
+            batch_start = i * MAX_BATCH_SIZE
+            batch_end = min((i + 1) * MAX_BATCH_SIZE, len(texts))
+            text_batch = texts[batch_start:batch_end]
+
+            logging.info(f"Processing batch {i + 1}/{total_batches}, size: {len(text_batch)}")
+
             if db is None:
-                db = Chroma.from_documents(…
+                db = Chroma.from_documents(text_batch, embeddings, persist_directory=persist_directory)
             else:
-                db.add_documents(…
+                db.add_documents(text_batch)
+
         db.persist()
-        logging.info("Data ingestion completed…
+        logging.info("Data ingestion completed successfully")
+
     except Exception as e:
-        logging.error(f"…
-        …
+        logging.error(f"Error during data ingestion: {str(e)}")
+        raise
+
 
 def llm_pipeline():
     pipe = pipeline(
```
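The "scanned image" warning above is a heuristic: an empty `page_content` usually means the PDF has no text layer. Since app.py already imports `fitz` (PyMuPDF), the file can be checked directly; this is a sketch under that assumption, with a hypothetical helper name and path:

```python
import fitz  # PyMuPDF

def has_text_layer(pdf_path: str) -> bool:
    """Return True if any page yields extractable text."""
    with fitz.open(pdf_path) as doc:
        return any(page.get_text().strip() for page in doc)

# A False result (e.g. for "uploads/scanned.pdf") suggests the file
# needs OCR before ingestion.
```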
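The new batching loop guards against Chroma's per-call insert limit: on common chromadb builds an oversized insert fails with an error like "Batch size ... exceeds maximum batch size 5461", which is presumably where `MAX_BATCH_SIZE = 5461` comes from. The same slicing logic as a standalone sketch (the `iter_batches` helper is hypothetical):

```python
import math

MAX_BATCH_SIZE = 5461  # matches Chroma's insert limit on common builds

def iter_batches(items, batch_size=MAX_BATCH_SIZE):
    """Yield successive slices of `items`, each at most `batch_size` long."""
    for i in range(math.ceil(len(items) / batch_size)):
        yield items[i * batch_size : (i + 1) * batch_size]

# Mirrors the loop in data_ingestion():
# db = None
# for batch in iter_batches(texts):
#     if db is None:
#         db = Chroma.from_documents(batch, embeddings, persist_directory=persist_directory)
#     else:
#         db.add_documents(batch)
```

One version note: chromadb 0.4+ persists automatically, and newer langchain-chroma releases drop `persist()` entirely, so the `db.persist()` call may need to be removed or guarded depending on the pinned versions.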