Update app.py
Browse files
app.py
CHANGED
@@ -15,6 +15,11 @@ from langchain.docstore.document import Document
|
|
15 |
import docx
|
16 |
import os
|
17 |
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
st.markdown("""
|
20 |
<style>
|
@@ -212,19 +217,23 @@ st.markdown(f"""
|
|
212 |
|
213 |
|
214 |
# ----------------- لود csv و ساخت ایندکس -----------------
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
221 |
-
# تقسیم متن‌ها به دسته‌های کوچک‌تر برای جلوگیری از خطای 413
|
222 |
-
batch_size = 100 # این مقدار را میتوانید تنظیم کنید
|
223 |
embeddings = []
|
224 |
-
for
|
225 |
-
|
226 |
-
|
227 |
-
embeddings.extend([item.embedding for item in response.data])
|
228 |
return embeddings
|
229 |
|
230 |
def embed_query(self, text: str) -> List[float]:
|
@@ -246,6 +255,10 @@ def get_docx_index(folder_path):
|
|
246 |
if file_text.strip():
|
247 |
texts.append(file_text)
|
248 |
|
|
|
|
|
|
|
|
|
249 |
# تقسیم متن‌ها
|
250 |
text_splitter = RecursiveCharacterTextSplitter(
|
251 |
chunk_size=300,
|
@@ -254,14 +267,11 @@ def get_docx_index(folder_path):
|
|
254 |
separators=["\n\n", "\n", " ", ""]
|
255 |
)
|
256 |
split_texts = []
|
257 |
-
for text in
|
258 |
split_texts.extend(text_splitter.split_text(text))
|
259 |
|
260 |
-
# ایجاد embedding
|
261 |
-
embeddings =
|
262 |
-
model_name="togethercomputer/m2-bert-80M-8k-retrieval",
|
263 |
-
api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
|
264 |
-
)
|
265 |
|
266 |
# ساخت ایندکس
|
267 |
index_creator = VectorstoreIndexCreator(
|
@@ -281,8 +291,6 @@ try:
|
|
281 |
except Exception as e:
|
282 |
st.error(f"❌ خطا در ساخت ایندکس: {e}")
|
283 |
|
284 |
-
|
285 |
-
|
286 |
#------------------------------------------
|
287 |
llm = ChatOpenAI(
|
288 |
base_url="https://api.together.xyz/v1",
|
|
|
15 |
import docx
|
16 |
import os
|
17 |
|
18 |
+
from hazm import *
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
|
23 |
|
24 |
st.markdown("""
|
25 |
<style>
|
|
|
217 |
|
218 |
|
219 |
# ----------------- لود csv و ساخت ایندکس -----------------
|
220 |
+
normalizer = Normalizer()
|
221 |
+
|
222 |
+
# توکنایزر هضم
|
223 |
+
tokenizer = word_tokenize
|
224 |
+
|
225 |
+
# بارگذاری مدل WordEmbedding
|
226 |
+
word_embedding = WordEmbedding(model_type='fasttext', model_path='word2vec.bin')
|
227 |
+
|
228 |
+
class CustomEmbeddings(Embeddings):
|
229 |
+
def __init__(self, word_embedding: WordEmbedding):
|
230 |
+
self.word_embedding = word_embedding
|
231 |
|
232 |
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
|
|
|
|
233 |
embeddings = []
|
234 |
+
for text in texts:
|
235 |
+
# ایجاد امبدینگ برای هر کلمه در متن
|
236 |
+
embeddings.append([self.word_embedding.embed(word) for word in tokenizer(text)])
|
|
|
237 |
return embeddings
|
238 |
|
239 |
def embed_query(self, text: str) -> List[float]:
|
|
|
255 |
if file_text.strip():
|
256 |
texts.append(file_text)
|
257 |
|
258 |
+
# نرمال‌سازی و توکنایز کردن متن‌ها
|
259 |
+
normalized_texts = [normalizer.normalize(text) for text in texts]
|
260 |
+
tokenized_texts = [tokenizer(text) for text in normalized_texts]
|
261 |
+
|
262 |
# تقسیم متن‌ها
|
263 |
text_splitter = RecursiveCharacterTextSplitter(
|
264 |
chunk_size=300,
|
|
|
267 |
separators=["\n\n", "\n", " ", ""]
|
268 |
)
|
269 |
split_texts = []
|
270 |
+
for text in normalized_texts:
|
271 |
split_texts.extend(text_splitter.split_text(text))
|
272 |
|
273 |
+
# ایجاد embedding با استفاده از WordEmbedding
|
274 |
+
embeddings = CustomEmbeddings(word_embedding=word_embedding)
|
|
|
|
|
|
|
275 |
|
276 |
# ساخت ایندکس
|
277 |
index_creator = VectorstoreIndexCreator(
|
|
|
291 |
except Exception as e:
|
292 |
st.error(f"❌ خطا در ساخت ایندکس: {e}")
|
293 |
|
|
|
|
|
294 |
#------------------------------------------
|
295 |
llm = ChatOpenAI(
|
296 |
base_url="https://api.together.xyz/v1",
|