Update app.py
app.py CHANGED
@@ -80,12 +80,21 @@ def load_document(file_name, file_path):
 
 def process_documents(selected_files):
     global vector_store
+
+    # ✅ Clear the existing vector store before processing new documents
+    if vector_store is not None:
+        logging.info("🗑️ Clearing previous document embeddings...")
+        vector_store.delete_collection()  # Clears existing stored data
+
     docs = []
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        future_to_file = {
+        future_to_file = {
+            executor.submit(load_document, file_name, download_file(file_id_map[file_name], file_name)): file_name
+            for file_name in selected_files
+        }
         for future in concurrent.futures.as_completed(future_to_file):
             docs.extend(future.result())
-
+
     total_words = sum(len(doc.page_content.split()) for doc in docs)
     if total_words < 1000:
         chunk_size, chunk_overlap, file_size_category = 500, 50, "small"
@@ -93,16 +102,24 @@ def process_documents(selected_files):
         chunk_size, chunk_overlap, file_size_category = 1000, 100, "medium"
     else:
         chunk_size, chunk_overlap, file_size_category = 2000, 200, "large"
-
+
     logging.info(f"📄 Document Size: {total_words} words | Category: {file_size_category} | Chunk Size: {chunk_size}")
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     split_docs = text_splitter.split_documents(docs)
-
+
+    embedding_model = (
+        "sentence-transformers/all-MiniLM-L6-v2" if file_size_category == "small" else "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    )
     logging.info(f"🧠 Using Transformer Model: {embedding_model}")
+
     embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+
+    # ✅ Create a new Chroma vector store for new documents
     vector_store = Chroma.from_documents(split_docs, embeddings)
+
     return "✅ Documents processed successfully!"
 
+
 def query_document(question):
     if vector_store is None:
         return "❌ No documents processed.", None
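For reference, the size-based heuristic that process_documents applies after this change can be distilled into a small standalone helper. The sketch below is illustrative only: choose_processing_params is a hypothetical name that does not exist in app.py, and the 5000-word cutoff for the "medium" bucket is an assumption, since that elif condition falls outside the hunks shown above. The thresholds, chunk settings, and model names are taken from the diff.

# Illustrative sketch only: distills the chunking/embedding heuristic from
# process_documents above. choose_processing_params is a hypothetical helper,
# not a function defined in app.py, and the 5000-word "medium" cutoff is an
# assumption (that elif line is outside the hunks shown).
def choose_processing_params(total_words: int) -> dict:
    if total_words < 1000:
        chunk_size, chunk_overlap, file_size_category = 500, 50, "small"
    elif total_words < 5000:  # assumed cutoff
        chunk_size, chunk_overlap, file_size_category = 1000, 100, "medium"
    else:
        chunk_size, chunk_overlap, file_size_category = 2000, 200, "large"

    # Small documents get all-MiniLM-L6-v2; larger ones fall back to the
    # lighter 3-layer paraphrase model, mirroring the selection in the diff.
    embedding_model = (
        "sentence-transformers/all-MiniLM-L6-v2"
        if file_size_category == "small"
        else "sentence-transformers/paraphrase-MiniLM-L3-v2"
    )
    return {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "file_size_category": file_size_category,
        "embedding_model": embedding_model,
    }

if __name__ == "__main__":
    print(choose_processing_params(800))    # small -> 500/50, all-MiniLM-L6-v2
    print(choose_processing_params(12000))  # large -> 2000/200, paraphrase-MiniLM-L3-v2

Routing only small documents to sentence-transformers/all-MiniLM-L6-v2 and everything else to the lighter 3-layer paraphrase-MiniLM-L3-v2 presumably trades a little embedding quality for speed on larger uploads, consistent with the chunk size growing with document length.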