pradeepsengarr committed
Commit 51ac619 · verified · 1 Parent(s): 3fc3a1e

Update app.py

Files changed (1): app.py (+30 -13)
app.py CHANGED
@@ -147,7 +147,7 @@ def extract_text_from_pdf(file_path):
     return None
 
 def data_ingestion():
-    """Load PDFs and create embeddings."""
+    """Function to load PDFs and create embeddings with improved error handling and efficiency."""
     try:
         logging.info("Starting data ingestion")
 
@@ -159,22 +159,39 @@ def data_ingestion():
             if filename.endswith(".pdf"):
                 file_path = os.path.join(uploaded_files_dir, filename)
                 logging.info(f"Processing file: {file_path}")
-
-                # Extract text using PyMuPDF
-                text = extract_text_from_pdf(file_path)
 
-                if text:
-                    documents.append({"page_content": text, "source": file_path})
-                else:
-                    logging.warning(f"Skipping file due to extraction error: {file_path}")
+                loader = PDFMinerLoader(file_path)
+
+                loaded_docs = loader.load()
+
+                # Check the structure of the loaded docs to ensure it has the correct format
+                for doc in loaded_docs:
+                    if isinstance(doc, dict):  # If the document is a dictionary
+                        # Extract text content if present in the dictionary
+                        if 'content' in doc:
+                            doc_content = doc['content']
+                        else:
+                            logging.warning(f"Skipping invalid document structure in {file_path}")
+                            continue
+                    elif hasattr(doc, 'page_content'):  # If the document is a proper object
+                        doc_content = doc.page_content
+                    else:
+                        logging.warning(f"Skipping invalid document structure in {file_path}")
+                        continue
+
+                    # If document content exists, add it to the documents list
+                    if doc_content and len(doc_content.strip()) > 0:
+                        documents.append(doc)
+                    else:
+                        logging.warning(f"Skipping empty or invalid document: {file_path}")
 
         if not documents:
             logging.error("No valid documents found to process.")
             return
 
         logging.info(f"Total valid documents: {len(documents)}")
-
-        # Split the documents into chunks
+
+        # Split documents into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        texts = text_splitter.split_documents(documents)
 
@@ -185,9 +202,9 @@ def data_ingestion():
             return
 
         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
-        # Process text chunks (embedding and persistence)
-        MAX_BATCH_SIZE = 5461
+
+        # Proceed to split and embed the documents
+        MAX_BATCH_SIZE = 5461
         total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
 
         logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
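A note on the structure check in the new loop: PDFMinerLoader.load() (the class ships in langchain_community.document_loaders in current LangChain releases) returns a list of Document objects, so the hasattr(doc, 'page_content') branch is the one that normally fires and the dict branch is purely defensive. A minimal sketch of the returned shape, with a hypothetical file path:

from langchain_community.document_loaders import PDFMinerLoader

loader = PDFMinerLoader("uploaded_files/example.pdf")  # hypothetical path
docs = loader.load()  # a list of Document objects (one per file by default)

for doc in docs:
    # Each Document exposes the extracted text plus source metadata,
    # so hasattr(doc, "page_content") is the branch that normally runs.
    print(type(doc).__name__)          # Document
    print(doc.metadata.get("source"))  # uploaded_files/example.pdf
    print(len(doc.page_content))       # character count of the extracted text

If that holds, the loop could be reduced to the page_content branch alone; the dict branch is harmless but should never trigger.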
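The last hunk stops right before the loop that consumes total_batches. MAX_BATCH_SIZE = 5461 matches the per-call document limit some Chroma builds enforce on add_documents, which suggests the chunks are pushed into a Chroma store batch by batch. A hedged sketch of what that loop might look like; the persist_in_batches name and the "db" persist directory are assumptions, not the commit's actual code:

import logging
import math

from langchain_community.vectorstores import Chroma

MAX_BATCH_SIZE = 5461  # per-call limit some Chroma versions enforce

def persist_in_batches(texts, embeddings, persist_directory="db"):
    """Embed and store chunked Documents in batches Chroma will accept."""
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
    logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
    for i in range(total_batches):
        batch = texts[i * MAX_BATCH_SIZE : (i + 1) * MAX_BATCH_SIZE]
        db.add_documents(batch)  # embeds the batch via embedding_function, then stores it
        logging.info(f"Persisted batch {i + 1} of {total_batches}")
    return db

Creating the store once outside the loop appends every batch to the same collection instead of rebuilding it per batch.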