sreesh2804 committed on
Commit
e7b7240
·
verified ·
1 Parent(s): 29569c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -4
app.py CHANGED
@@ -80,12 +80,21 @@ def load_document(file_name, file_path):
80
 
81
  def process_documents(selected_files):
82
  global vector_store
 
 
 
 
 
 
83
  docs = []
84
  with concurrent.futures.ThreadPoolExecutor() as executor:
85
- future_to_file = {executor.submit(load_document, file_name, download_file(file_id_map[file_name], file_name)): file_name for file_name in selected_files}
 
 
 
86
  for future in concurrent.futures.as_completed(future_to_file):
87
  docs.extend(future.result())
88
-
89
  total_words = sum(len(doc.page_content.split()) for doc in docs)
90
  if total_words < 1000:
91
  chunk_size, chunk_overlap, file_size_category = 500, 50, "small"
@@ -93,16 +102,24 @@ def process_documents(selected_files):
93
  chunk_size, chunk_overlap, file_size_category = 1000, 100, "medium"
94
  else:
95
  chunk_size, chunk_overlap, file_size_category = 2000, 200, "large"
96
-
97
  logging.info(f"📄 Document Size: {total_words} words | Category: {file_size_category} | Chunk Size: {chunk_size}")
98
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
99
  split_docs = text_splitter.split_documents(docs)
100
- embedding_model = "sentence-transformers/all-MiniLM-L6-v2" if file_size_category == "small" else "sentence-transformers/paraphrase-MiniLM-L3-v2"
 
 
 
101
  logging.info(f"🧠 Using Transformer Model: {embedding_model}")
 
102
  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
 
 
103
  vector_store = Chroma.from_documents(split_docs, embeddings)
 
104
  return "✅ Documents processed successfully!"
105
 
 
106
  def query_document(question):
107
  if vector_store is None:
108
  return "❌ No documents processed.", None
 
80
 
81
def process_documents(selected_files):
    """Download, load, split, embed, and index the selected files.

    Replaces the module-level ``vector_store`` with a fresh Chroma index
    built from ``selected_files``.

    Parameters:
        selected_files: iterable of file names; each must be a key in the
            module-level ``file_id_map``.

    Returns:
        A human-readable status string on success.

    Raises:
        Whatever ``download_file`` / ``load_document`` raise — worker
        exceptions propagate through ``future.result()``.
    """
    global vector_store

    # ✅ Clear the existing vector store before processing new documents so
    # stale embeddings from a previous selection cannot leak into answers.
    if vector_store is not None:
        logging.info("🗑️ Clearing previous document embeddings...")
        vector_store.delete_collection()  # Clears existing stored data

    def _fetch_and_load(file_name):
        # Runs inside a worker thread. The original code called
        # download_file(...) eagerly while building the futures dict, which
        # serialized every download in the submitting thread and defeated
        # the executor; downloading here lets downloads overlap.
        return load_document(file_name, download_file(file_id_map[file_name], file_name))

    docs = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_file = {
            executor.submit(_fetch_and_load, file_name): file_name
            for file_name in selected_files
        }
        for future in concurrent.futures.as_completed(future_to_file):
            docs.extend(future.result())  # re-raises any worker exception

    # Choose chunking parameters from the total corpus size.
    total_words = sum(len(doc.page_content.split()) for doc in docs)
    if total_words < 1000:
        chunk_size, chunk_overlap, file_size_category = 500, 50, "small"
    # NOTE(review): the elif threshold line is hidden between diff hunks in
    # the capture; 5000 is assumed from the small/medium/large pattern — confirm.
    elif total_words < 5000:
        chunk_size, chunk_overlap, file_size_category = 1000, 100, "medium"
    else:
        chunk_size, chunk_overlap, file_size_category = 2000, 200, "large"

    logging.info(f"📄 Document Size: {total_words} words | Category: {file_size_category} | Chunk Size: {chunk_size}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)

    # Small corpora get the (presumably higher-quality) MiniLM-L6 model;
    # larger ones fall back to the lighter paraphrase-MiniLM-L3 model.
    embedding_model = (
        "sentence-transformers/all-MiniLM-L6-v2"
        if file_size_category == "small"
        else "sentence-transformers/paraphrase-MiniLM-L3-v2"
    )
    logging.info(f"🧠 Using Transformer Model: {embedding_model}")

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    # ✅ Create a new Chroma vector store for new documents
    vector_store = Chroma.from_documents(split_docs, embeddings)

    return "✅ Documents processed successfully!"
121
 
122
+
123
  def query_document(question):
124
  if vector_store is None:
125
  return "❌ No documents processed.", None