Update app.py
app.py CHANGED
@@ -80,12 +80,21 @@ def load_document(file_name, file_path):
 
 def process_documents(selected_files):
     global vector_store
+
+    # ✅ Clear the existing vector store before processing new documents
+    if vector_store is not None:
+        logging.info("🗑️ Clearing previous document embeddings...")
+        vector_store.delete_collection()  # Clears existing stored data
+
     docs = []
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        future_to_file = {
+        future_to_file = {
+            executor.submit(load_document, file_name, download_file(file_id_map[file_name], file_name)): file_name
+            for file_name in selected_files
+        }
         for future in concurrent.futures.as_completed(future_to_file):
             docs.extend(future.result())
-
+
     total_words = sum(len(doc.page_content.split()) for doc in docs)
     if total_words < 1000:
         chunk_size, chunk_overlap, file_size_category = 500, 50, "small"
@@ -93,16 +102,24 @@ def process_documents(selected_files):
         chunk_size, chunk_overlap, file_size_category = 1000, 100, "medium"
     else:
         chunk_size, chunk_overlap, file_size_category = 2000, 200, "large"
-
+
     logging.info(f"📄 Document Size: {total_words} words | Category: {file_size_category} | Chunk Size: {chunk_size}")
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     split_docs = text_splitter.split_documents(docs)
-
+
+    embedding_model = (
+        "sentence-transformers/all-MiniLM-L6-v2" if file_size_category == "small" else "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    )
     logging.info(f"🧠 Using Transformer Model: {embedding_model}")
+
     embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+
+    # ✅ Create a new Chroma vector store for new documents
     vector_store = Chroma.from_documents(split_docs, embeddings)
+
     return "✅ Documents processed successfully!"
 
+
 def query_document(question):
     if vector_store is None:
         return "❌ No documents processed.", None
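For reference, the size-based heuristic that process_documents applies after this change can be distilled into a small standalone helper. The sketch below is illustrative only: choose_processing_params is a hypothetical name that does not exist in app.py, and the 5000-word cutoff for the "medium" bucket is an assumption, since that elif condition falls outside the hunks shown above. The thresholds, chunk settings, and model names are taken from the diff.

# Illustrative sketch only: distills the chunking/embedding heuristic from
# process_documents above. choose_processing_params is a hypothetical helper,
# not a function defined in app.py, and the 5000-word "medium" cutoff is an
# assumption (that elif line is outside the hunks shown).
def choose_processing_params(total_words: int) -> dict:
    if total_words < 1000:
        chunk_size, chunk_overlap, file_size_category = 500, 50, "small"
    elif total_words < 5000:  # assumed cutoff
        chunk_size, chunk_overlap, file_size_category = 1000, 100, "medium"
    else:
        chunk_size, chunk_overlap, file_size_category = 2000, 200, "large"

    # Small documents get all-MiniLM-L6-v2; larger ones fall back to the
    # lighter 3-layer paraphrase model, mirroring the selection in the diff.
    embedding_model = (
        "sentence-transformers/all-MiniLM-L6-v2"
        if file_size_category == "small"
        else "sentence-transformers/paraphrase-MiniLM-L3-v2"
    )
    return {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "file_size_category": file_size_category,
        "embedding_model": embedding_model,
    }

if __name__ == "__main__":
    print(choose_processing_params(800))    # small -> 500/50, all-MiniLM-L6-v2
    print(choose_processing_params(12000))  # large -> 2000/200, paraphrase-MiniLM-L3-v2

Routing only small documents to sentence-transformers/all-MiniLM-L6-v2 and everything else to the lighter 3-layer paraphrase-MiniLM-L3-v2 presumably trades a little embedding quality for speed on larger uploads, consistent with the chunk size growing with document length.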