M17idd committed on
Commit
2242369
·
verified ·
1 Parent(s): d9bc83c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -9
app.py CHANGED
@@ -11,6 +11,7 @@ from typing import List
11
  from together import Together
12
  import pandas as pd
13
  import streamlit as st
 
14
 
15
 
16
  import numpy as np
@@ -191,7 +192,7 @@ st.markdown('<div class="chat-message">👋 سلام! چطور میتونم کم
191
  # ⚙️ مدل Embedding ساده و سریع
192
  @st.cache_resource
193
  def get_embedding_model():
194
- return SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
195
 
196
  @st.cache_resource
197
  def process_csv(csv_file):
@@ -200,7 +201,7 @@ def process_csv(csv_file):
200
  texts = [text for text in texts if text.strip()]
201
 
202
  text_splitter = RecursiveCharacterTextSplitter(
203
- chunk_size=300,
204
  chunk_overlap=50,
205
  length_function=len,
206
  separators=["\n\n", "\n", " ", ""]
@@ -214,9 +215,9 @@ def process_csv(csv_file):
214
  model = get_embedding_model()
215
  embeddings = model.encode(split_texts, show_progress_bar=True)
216
 
217
- # ساخت ایندکس FAISS
218
  dim = embeddings.shape[1]
219
- index = faiss.IndexFlatL2(dim)
 
220
  index.add(np.array(embeddings))
221
 
222
  return split_texts, embeddings, index
@@ -253,13 +254,22 @@ if st.session_state.pending_prompt:
253
  thinking = st.empty()
254
  thinking.markdown("🤖 در حال جستجو...")
255
 
256
- # امبد کردن سؤال و جستجو
257
  model = get_embedding_model()
258
  query_vector = model.encode([st.session_state.pending_prompt])
259
- D, I = index.search(np.array(query_vector), k=3) # 3 نتیجه نزدیک
260
-
261
- results = [texts[i] for i in I[0]]
262
- response = "🧠 نزدیک‌ترین پاسخ‌ها:\n\n" + "\n\n---\n\n".join(results)
 
 
 
 
 
 
 
 
 
 
263
 
264
  thinking.empty()
265
  full_response = ""
 
11
  from together import Together
12
  import pandas as pd
13
  import streamlit as st
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
 
16
 
17
  import numpy as np
 
192
  # ⚙️ مدل Embedding ساده و سریع
193
  @st.cache_resource
194
  def get_embedding_model():
195
+ return SentenceTransformer("HooshvareLab/bert-fa-zwnj-base")
196
 
197
  @st.cache_resource
198
  def process_csv(csv_file):
 
201
  texts = [text for text in texts if text.strip()]
202
 
203
  text_splitter = RecursiveCharacterTextSplitter(
204
+ chunk_size=200,
205
  chunk_overlap=50,
206
  length_function=len,
207
  separators=["\n\n", "\n", " ", ""]
 
215
  model = get_embedding_model()
216
  embeddings = model.encode(split_texts, show_progress_bar=True)
217
 
 
218
  dim = embeddings.shape[1]
219
+ index = faiss.IndexHNSWFlat(dim, 32)
220
+ index.hnsw.efSearch = 50
221
  index.add(np.array(embeddings))
222
 
223
  return split_texts, embeddings, index
 
254
  thinking = st.empty()
255
  thinking.markdown("🤖 در حال جستجو...")
256
 
 
257
  model = get_embedding_model()
258
  query_vector = model.encode([st.session_state.pending_prompt])
259
+
260
+ D, I = index.search(np.array(query_vector), k=10)
261
+
262
+ top_indices = I[0]
263
+ top_texts = [texts[i] for i in top_indices]
264
+ top_vectors = np.array([embeddings[i] for i in top_indices])
265
+
266
+ similarities = cosine_similarity(query_vector, top_vectors)[0]
267
+
268
+ # پیدا کردن دقیق‌ترین متن
269
+ best_match_relative_index = np.argmax(similarities)
270
+ best_match_index = top_indices[best_match_relative_index]
271
+ best_match_text = texts[best_match_index]
272
+ response = "🧠 پاسخ سوال :\n\n" .join(best_match_text)
273
 
274
  thinking.empty()
275
  full_response = ""