Update app.py
app.py CHANGED
@@ -160,38 +160,29 @@ def data_ingestion():
         file_path = os.path.join(uploaded_files_dir, filename)
         logging.info(f"Processing file: {file_path}")
 
-
-
-
-
-
-        for doc in loaded_docs:
-            if isinstance(doc, dict):  # If the document is a dictionary
-                # Extract text content if present in the dictionary
-                if 'content' in doc:
-                    doc_content = doc['content']
-                else:
-                    logging.warning(f"Skipping invalid document structure in {file_path}")
-                    continue
-            elif hasattr(doc, 'page_content'):  # If the document is a proper object
-                doc_content = doc.page_content
-            else:
-                logging.warning(f"Skipping invalid document structure in {file_path}")
+        try:
+            loader = PDFMinerLoader(file_path)
+            loaded_docs = loader.load()
+            if not loaded_docs:
+                logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
                 continue
 
-
-
-
-
-
+            for doc in loaded_docs:
+                if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
+                    documents.append(doc)
+                else:
+                    logging.warning(f"Skipping invalid document structure in {file_path}")
+        except ValueError as e:
+            logging.error(f"Skipping {file_path}: {str(e)}")
+            continue
 
     if not documents:
         logging.error("No valid documents found to process.")
         return
 
     logging.info(f"Total valid documents: {len(documents)}")
-
-    #
+
+    # Proceed with splitting and embedding documents
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
     texts = text_splitter.split_documents(documents)
 
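For reference, a self-contained sketch of the ingestion flow after this change. It assumes the loop lives inside data_ingestion(), that PDFMinerLoader and RecursiveCharacterTextSplitter come from the LangChain packages, and that the import paths and uploaded_files_dir value shown here are illustrative rather than taken from app.py:

# Standalone sketch of the revised loading/validation pattern (assumed import paths).
import logging
import os

from langchain_community.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

logging.basicConfig(level=logging.INFO)

uploaded_files_dir = "uploaded_files"  # hypothetical upload directory
documents = []

for filename in os.listdir(uploaded_files_dir):
    file_path = os.path.join(uploaded_files_dir, filename)
    logging.info(f"Processing file: {file_path}")

    try:
        loader = PDFMinerLoader(file_path)
        loaded_docs = loader.load()
        if not loaded_docs:
            logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
            continue

        # Keep only documents that actually carry text content.
        for doc in loaded_docs:
            if hasattr(doc, "page_content") and doc.page_content.strip():
                documents.append(doc)
            else:
                logging.warning(f"Skipping invalid document structure in {file_path}")
    except ValueError as e:
        # Mirrors the except ValueError in the diff; other loader errors still propagate.
        logging.error(f"Skipping {file_path}: {e}")
        continue

if not documents:
    logging.error("No valid documents found to process.")
else:
    # Proceed with splitting the validated documents into chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    logging.info(f"Split into {len(texts)} chunks")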