pradeepsengarr committed on
Commit
3794b5e
·
verified ·
1 Parent(s): 51ac619

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -24
app.py CHANGED
@@ -160,38 +160,29 @@ def data_ingestion():
160
  file_path = os.path.join(uploaded_files_dir, filename)
161
  logging.info(f"Processing file: {file_path}")
162
 
163
- loader = PDFMinerLoader(file_path)
164
-
165
- loaded_docs = loader.load()
166
-
167
- # Check the structure of the loaded docs to ensure it has the correct format
168
- for doc in loaded_docs:
169
- if isinstance(doc, dict): # If the document is a dictionary
170
- # Extract text content if present in the dictionary
171
- if 'content' in doc:
172
- doc_content = doc['content']
173
- else:
174
- logging.warning(f"Skipping invalid document structure in {file_path}")
175
- continue
176
- elif hasattr(doc, 'page_content'): # If the document is a proper object
177
- doc_content = doc.page_content
178
- else:
179
- logging.warning(f"Skipping invalid document structure in {file_path}")
180
  continue
181
 
182
- # If document content exists, add it to the documents list
183
- if doc_content and len(doc_content.strip()) > 0:
184
- documents.append(doc)
185
- else:
186
- logging.warning(f"Skipping empty or invalid document: {file_path}")
 
 
 
187
 
188
  if not documents:
189
  logging.error("No valid documents found to process.")
190
  return
191
 
192
  logging.info(f"Total valid documents: {len(documents)}")
193
-
194
- # Split documents into smaller chunks
195
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
196
  texts = text_splitter.split_documents(documents)
197
 
 
160
  file_path = os.path.join(uploaded_files_dir, filename)
161
  logging.info(f"Processing file: {file_path}")
162
 
163
+ try:
164
+ loader = PDFMinerLoader(file_path)
165
+ loaded_docs = loader.load()
166
+ if not loaded_docs:
167
+ logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
168
  continue
169
 
170
+ for doc in loaded_docs:
171
+ if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
172
+ documents.append(doc)
173
+ else:
174
+ logging.warning(f"Skipping invalid document structure in {file_path}")
175
+ except ValueError as e:
176
+ logging.error(f"Skipping {file_path}: {str(e)}")
177
+ continue
178
 
179
  if not documents:
180
  logging.error("No valid documents found to process.")
181
  return
182
 
183
  logging.info(f"Total valid documents: {len(documents)}")
184
+
185
+ # Proceed with splitting and embedding documents
186
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
187
  texts = text_splitter.split_documents(documents)
188