Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,7 @@ from typing import List
|
|
11 |
from together import Together
|
12 |
import pandas as pd
|
13 |
import streamlit as st
|
|
|
14 |
|
15 |
|
16 |
import numpy as np
|
@@ -191,7 +192,7 @@ st.markdown('<div class="chat-message">👋 سلام! چطور میتونم کم
|
|
191 |
# ⚙️ Simple and fast embedding model
|
192 |
@st.cache_resource
|
193 |
def get_embedding_model():
|
194 |
-
return SentenceTransformer(
|
195 |
|
196 |
@st.cache_resource
|
197 |
def process_csv(csv_file):
|
@@ -200,7 +201,7 @@ def process_csv(csv_file):
|
|
200 |
texts = [text for text in texts if text.strip()]
|
201 |
|
202 |
text_splitter = RecursiveCharacterTextSplitter(
|
203 |
-
chunk_size=
|
204 |
chunk_overlap=50,
|
205 |
length_function=len,
|
206 |
separators=["\n\n", "\n", " ", ""]
|
@@ -214,9 +215,9 @@ def process_csv(csv_file):
|
|
214 |
model = get_embedding_model()
|
215 |
embeddings = model.encode(split_texts, show_progress_bar=True)
|
216 |
|
217 |
-
# ساخت ایندکس FAISS
|
218 |
dim = embeddings.shape[1]
|
219 |
-
index = faiss.
|
|
|
220 |
index.add(np.array(embeddings))
|
221 |
|
222 |
return split_texts, embeddings, index
|
@@ -253,13 +254,22 @@ if st.session_state.pending_prompt:
|
|
253 |
thinking = st.empty()
|
254 |
thinking.markdown("🤖 در حال جستجو...")
|
255 |
|
256 |
-
# امبد کردن سؤال و جستجو
|
257 |
model = get_embedding_model()
|
258 |
query_vector = model.encode([st.session_state.pending_prompt])
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
thinking.empty()
|
265 |
full_response = ""
|
|
|
11 |
from together import Together
|
12 |
import pandas as pd
|
13 |
import streamlit as st
|
14 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
15 |
|
16 |
|
17 |
import numpy as np
|
|
|
192 |
# ⚙️ Simple and fast embedding model
|
193 |
@st.cache_resource
|
194 |
def get_embedding_model():
|
195 |
+
return SentenceTransformer("HooshvareLab/bert-fa-zwnj-base")
|
196 |
|
197 |
@st.cache_resource
|
198 |
def process_csv(csv_file):
|
|
|
201 |
texts = [text for text in texts if text.strip()]
|
202 |
|
203 |
text_splitter = RecursiveCharacterTextSplitter(
|
204 |
+
chunk_size=200,
|
205 |
chunk_overlap=50,
|
206 |
length_function=len,
|
207 |
separators=["\n\n", "\n", " ", ""]
|
|
|
215 |
model = get_embedding_model()
|
216 |
embeddings = model.encode(split_texts, show_progress_bar=True)
|
217 |
|
|
|
218 |
dim = embeddings.shape[1]
|
219 |
+
index = faiss.IndexHNSWFlat(dim, 32)
|
220 |
+
index.hnsw.efSearch = 50
|
221 |
index.add(np.array(embeddings))
|
222 |
|
223 |
return split_texts, embeddings, index
|
|
|
254 |
thinking = st.empty()
|
255 |
thinking.markdown("🤖 در حال جستجو...")
|
256 |
|
|
|
257 |
model = get_embedding_model()
|
258 |
query_vector = model.encode([st.session_state.pending_prompt])
|
259 |
+
|
260 |
+
D, I = index.search(np.array(query_vector), k=10)
|
261 |
+
|
262 |
+
top_indices = I[0]
|
263 |
+
top_texts = [texts[i] for i in top_indices]
|
264 |
+
top_vectors = np.array([embeddings[i] for i in top_indices])
|
265 |
+
|
266 |
+
similarities = cosine_similarity(query_vector, top_vectors)[0]
|
267 |
+
|
268 |
+
# Find the best-matching text
|
269 |
+
best_match_relative_index = np.argmax(similarities)
|
270 |
+
best_match_index = top_indices[best_match_relative_index]
|
271 |
+
best_match_text = texts[best_match_index]
|
272 |
+
# Prefix the answer header once. The previous `"header".join(text)` used the header as a separator between every character of the matched text.
response = f"🧠 پاسخ سوال :\n\n{best_match_text}"
|
273 |
|
274 |
thinking.empty()
|
275 |
full_response = ""
|