Update app.py
app.py CHANGED
@@ -160,38 +160,29 @@ def data_ingestion():
         file_path = os.path.join(uploaded_files_dir, filename)
         logging.info(f"Processing file: {file_path}")
 
-
-
-
-
-
-        for doc in loaded_docs:
-            if isinstance(doc, dict):  # If the document is a dictionary
-                # Extract text content if present in the dictionary
-                if 'content' in doc:
-                    doc_content = doc['content']
-                else:
-                    logging.warning(f"Skipping invalid document structure in {file_path}")
-                    continue
-            elif hasattr(doc, 'page_content'):  # If the document is a proper object
-                doc_content = doc.page_content
-            else:
-                logging.warning(f"Skipping invalid document structure in {file_path}")
+        try:
+            loader = PDFMinerLoader(file_path)
+            loaded_docs = loader.load()
+            if not loaded_docs:
+                logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
                 continue
 
-
-
-
-
-
+            for doc in loaded_docs:
+                if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
+                    documents.append(doc)
+                else:
+                    logging.warning(f"Skipping invalid document structure in {file_path}")
+        except ValueError as e:
+            logging.error(f"Skipping {file_path}: {str(e)}")
+            continue
 
     if not documents:
         logging.error("No valid documents found to process.")
         return
 
     logging.info(f"Total valid documents: {len(documents)}")
-
-    #
+
+    # Proceed with splitting and embedding documents
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
     texts = text_splitter.split_documents(documents)
 
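For reference, a self-contained sketch of the ingestion flow after this change. It assumes the loop lives inside data_ingestion(), that PDFMinerLoader and RecursiveCharacterTextSplitter come from the LangChain packages, and that the import paths and uploaded_files_dir value shown here are illustrative rather than taken from app.py:

# Standalone sketch of the revised loading/validation pattern (assumed import paths).
import logging
import os

from langchain_community.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

logging.basicConfig(level=logging.INFO)

uploaded_files_dir = "uploaded_files"  # hypothetical upload directory
documents = []

for filename in os.listdir(uploaded_files_dir):
    file_path = os.path.join(uploaded_files_dir, filename)
    logging.info(f"Processing file: {file_path}")

    try:
        loader = PDFMinerLoader(file_path)
        loaded_docs = loader.load()
        if not loaded_docs:
            logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
            continue

        # Keep only documents that actually carry text content.
        for doc in loaded_docs:
            if hasattr(doc, "page_content") and doc.page_content.strip():
                documents.append(doc)
            else:
                logging.warning(f"Skipping invalid document structure in {file_path}")
    except ValueError as e:
        # Mirrors the except ValueError in the diff; other loader errors still propagate.
        logging.error(f"Skipping {file_path}: {e}")
        continue

if not documents:
    logging.error("No valid documents found to process.")
else:
    # Proceed with splitting the validated documents into chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    logging.info(f"Split into {len(texts)} chunks")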