Update app.py
app.py CHANGED
@@ -147,7 +147,7 @@ def extract_text_from_pdf(file_path):
         return None
 
 def data_ingestion():
-    """
+    """Function to load PDFs and create embeddings with improved error handling and efficiency."""
     try:
         logging.info("Starting data ingestion")
 
@@ -159,22 +159,39 @@ def data_ingestion():
             if filename.endswith(".pdf"):
                 file_path = os.path.join(uploaded_files_dir, filename)
                 logging.info(f"Processing file: {file_path}")
-
-                # Extract text using PyMuPDF
-                text = extract_text_from_pdf(file_path)
 
-
-
-
-
+                loader = PDFMinerLoader(file_path)
+
+                loaded_docs = loader.load()
+
+                # Check the structure of the loaded docs to ensure it has the correct format
+                for doc in loaded_docs:
+                    if isinstance(doc, dict):  # If the document is a dictionary
+                        # Extract text content if present in the dictionary
+                        if 'content' in doc:
+                            doc_content = doc['content']
+                        else:
+                            logging.warning(f"Skipping invalid document structure in {file_path}")
+                            continue
+                    elif hasattr(doc, 'page_content'):  # If the document is a proper object
+                        doc_content = doc.page_content
+                    else:
+                        logging.warning(f"Skipping invalid document structure in {file_path}")
+                        continue
+
+                    # If document content exists, add it to the documents list
+                    if doc_content and len(doc_content.strip()) > 0:
+                        documents.append(doc)
+                    else:
+                        logging.warning(f"Skipping empty or invalid document: {file_path}")
 
         if not documents:
             logging.error("No valid documents found to process.")
             return
 
         logging.info(f"Total valid documents: {len(documents)}")
-
-        # Split
+
+        # Split documents into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        texts = text_splitter.split_documents(documents)
 
@@ -185,9 +202,9 @@ def data_ingestion():
             return
 
         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
-        #
-        MAX_BATCH_SIZE = 5461
+
+        # Proceed to split and embed the documents
+        MAX_BATCH_SIZE = 5461
         total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
 
         logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
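Note: in current LangChain releases, PDFMinerLoader.load() returns Document objects that expose page_content, so the isinstance(doc, dict) branch in the new loop is most likely defensive rather than the normal path.

The last hunk stops just before the batch loop that MAX_BATCH_SIZE and total_batches set up. Below is a minimal sketch of how such a loop might continue, assuming the chunks are written to a persistent Chroma vector store; the embed_in_batches name, the persist_directory value, and the Chroma import are illustrative assumptions rather than part of this commit, and 5461 appears to correspond to the per-call batch limit enforced by some chromadb versions.

import logging

# Assumption: a Chroma store is used; the import path varies by LangChain version.
from langchain_community.vectorstores import Chroma

def embed_in_batches(texts, embeddings, persist_directory="chroma_db", max_batch_size=5461):
    """Sketch: add pre-split chunks to a persistent Chroma store without exceeding the per-call limit."""
    db = None
    for start in range(0, len(texts), max_batch_size):
        batch = texts[start:start + max_batch_size]
        if db is None:
            # The first batch creates (or reopens) the persistent collection.
            db = Chroma.from_documents(batch, embeddings, persist_directory=persist_directory)
        else:
            # Later batches are appended to the same collection.
            db.add_documents(batch)
        logging.info(f"Embedded batch {start // max_batch_size + 1}")
    return db

The first slice creates or reopens the collection and each later slice is appended to it, so no single call hands the store more than max_batch_size chunks.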