Update app.py
app.py CHANGED
@@ -147,7 +147,7 @@ def extract_text_from_pdf(file_path):
         return None
 
 def data_ingestion():
-    """
+    """Function to load PDFs and create embeddings with improved error handling and efficiency."""
     try:
         logging.info("Starting data ingestion")
 
@@ -159,22 +159,39 @@ def data_ingestion():
             if filename.endswith(".pdf"):
                 file_path = os.path.join(uploaded_files_dir, filename)
                 logging.info(f"Processing file: {file_path}")
-
-                # Extract text using PyMuPDF
-                text = extract_text_from_pdf(file_path)
 
-
-
-
-
+                loader = PDFMinerLoader(file_path)
+
+                loaded_docs = loader.load()
+
+                # Check the structure of the loaded docs to ensure it has the correct format
+                for doc in loaded_docs:
+                    if isinstance(doc, dict):  # If the document is a dictionary
+                        # Extract text content if present in the dictionary
+                        if 'content' in doc:
+                            doc_content = doc['content']
+                        else:
+                            logging.warning(f"Skipping invalid document structure in {file_path}")
+                            continue
+                    elif hasattr(doc, 'page_content'):  # If the document is a proper object
+                        doc_content = doc.page_content
+                    else:
+                        logging.warning(f"Skipping invalid document structure in {file_path}")
+                        continue
+
+                    # If document content exists, add it to the documents list
+                    if doc_content and len(doc_content.strip()) > 0:
+                        documents.append(doc)
+                    else:
+                        logging.warning(f"Skipping empty or invalid document: {file_path}")
 
         if not documents:
             logging.error("No valid documents found to process.")
             return
 
         logging.info(f"Total valid documents: {len(documents)}")
-
-        # Split
+
+        # Split documents into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        texts = text_splitter.split_documents(documents)
 
@@ -185,9 +202,9 @@ def data_ingestion():
             return
 
         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
-        #
-        MAX_BATCH_SIZE = 5461
+
+        # Proceed to split and embed the documents
+        MAX_BATCH_SIZE = 5461
         total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
 
         logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
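Note: in current LangChain releases, PDFMinerLoader.load() returns Document objects that expose page_content, so the isinstance(doc, dict) branch in the new loop is most likely defensive rather than the normal path.

The last hunk stops just before the batch loop that MAX_BATCH_SIZE and total_batches set up. Below is a minimal sketch of how such a loop might continue, assuming the chunks are written to a persistent Chroma vector store; the embed_in_batches name, the persist_directory value, and the Chroma import are illustrative assumptions rather than part of this commit, and 5461 appears to correspond to the per-call batch limit enforced by some chromadb versions.

import logging

# Assumption: a Chroma store is used; the import path varies by LangChain version.
from langchain_community.vectorstores import Chroma

def embed_in_batches(texts, embeddings, persist_directory="chroma_db", max_batch_size=5461):
    """Sketch: add pre-split chunks to a persistent Chroma store without exceeding the per-call limit."""
    db = None
    for start in range(0, len(texts), max_batch_size):
        batch = texts[start:start + max_batch_size]
        if db is None:
            # The first batch creates (or reopens) the persistent collection.
            db = Chroma.from_documents(batch, embeddings, persist_directory=persist_directory)
        else:
            # Later batches are appended to the same collection.
            db.add_documents(batch)
        logging.info(f"Embedded batch {start // max_batch_size + 1}")
    return db

The first slice creates or reopens the collection and each later slice is appended to it, so no single call hands the store more than max_batch_size chunks.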