Spaces:

M17idd
/

army

Running

App Files Files Community

M17idd committed 7 days ago

Commit

145151e

verified ·

1 Parent(s): ad8873d

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -18

app.py CHANGED Viewed

@@ -15,6 +15,11 @@ from langchain.docstore.document import Document
 import docx
 import os
 st.markdown("""
     <style>
@@ -212,19 +217,23 @@ st.markdown(f"""
 # ----------------- لود csv و ساخت ایندکس -----------------
-class TogetherEmbeddings(Embeddings):
-    def __init__(self, model_name: str, api_key: str):
-        self.model_name = model_name
-        self.client = Together(api_key=api_key)
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
-        # تقسیم متن‌ها به دسته‌های کوچک‌تر برای جلوگیری از خطای 413
-        batch_size = 100  # این مقدار را می‌توانید تنظیم کنید
         embeddings = []
-        for i in range(0, len(texts), batch_size):
-            batch = texts[i:i + batch_size]
-            response = self.client.embeddings.create(model=self.model_name, input=batch)
-            embeddings.extend([item.embedding for item in response.data])
         return embeddings
     def embed_query(self, text: str) -> List[float]:
@@ -246,6 +255,10 @@ def get_docx_index(folder_path):
                 if file_text.strip():
                     texts.append(file_text)
         # تقسیم متن‌ها
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=300,
@@ -254,14 +267,11 @@ def get_docx_index(folder_path):
             separators=["\n\n", "\n", " ", ""]
         )
         split_texts = []
-        for text in texts:
             split_texts.extend(text_splitter.split_text(text))
-        # ایجاد embedding
-        embeddings = TogetherEmbeddings(
-            model_name="togethercomputer/m2-bert-80M-8k-retrieval",
-            api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
-        )
         # ساخت ایندکس
         index_creator = VectorstoreIndexCreator(
@@ -281,8 +291,6 @@ try:
 except Exception as e:
     st.error(f"❌ خطا در ساخت ایندکس: {e}")
 #------------------------------------------
 llm = ChatOpenAI(
     base_url="https://api.together.xyz/v1",

 import docx
 import os
+from hazm import *
 st.markdown("""
     <style>
 # ----------------- لود csv و ساخت ایندکس -----------------
+normalizer = Normalizer()
+# توکنایزر هضم
+tokenizer = word_tokenize
+# بارگذاری مدل WordEmbedding
+word_embedding = WordEmbedding(model_type='fasttext', model_path='word2vec.bin')
+class CustomEmbeddings(Embeddings):
+    def __init__(self, word_embedding: WordEmbedding):
+        self.word_embedding = word_embedding
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
         embeddings = []
+        for text in texts:
+            # ایجاد امبدینگ برای هر کلمه در متن
+            embeddings.append([self.word_embedding.embed(word) for word in tokenizer(text)])
         return embeddings
     def embed_query(self, text: str) -> List[float]:
                 if file_text.strip():
                     texts.append(file_text)
+        # نرمال‌سازی و توکنایز کردن متن‌ها
+        normalized_texts = [normalizer.normalize(text) for text in texts]
+        tokenized_texts = [tokenizer(text) for text in normalized_texts]
         # تقسیم متن‌ها
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=300,
             separators=["\n\n", "\n", " ", ""]
         )
         split_texts = []
+        for text in normalized_texts:
             split_texts.extend(text_splitter.split_text(text))
+        # ایجاد embedding با استفاده از WordEmbedding
+        embeddings = CustomEmbeddings(word_embedding=word_embedding)
         # ساخت ایندکس
         index_creator = VectorstoreIndexCreator(
 except Exception as e:
     st.error(f"❌ خطا در ساخت ایندکس: {e}")
 #------------------------------------------
 llm = ChatOpenAI(
     base_url="https://api.together.xyz/v1",