Spaces:

M17idd
/

army

Running

App Files Files Community

M17idd committed 10 days ago

Commit

bc2155c

verified ·

1 Parent(s): d55f7c0

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -24

app.py CHANGED Viewed

@@ -11,6 +11,8 @@ from typing import List
 from together import Together
 import pandas as pd
 import streamlit as st
 # ----------------- تنظیمات صفحه -----------------
 st.set_page_config(page_title="رزم یار ارتش", page_icon="🪖", layout="wide")
@@ -201,56 +203,55 @@ class TogetherEmbeddings(Embeddings):
         return self.embed_documents([text])[0]
 @st.cache_resource
-def get_csv_index(csv_file):
-    with st.spinner('📄 در حال پردازش فایل CSV...'):
-        # خواندن داده‌های CSV
-        df = pd.read_csv(csv_file)
-        # تبدیل DataFrame به لیست از متون
-        texts = df.iloc[:, 0].astype(str).tolist()  # ستون اول را می‌گیرد
-        # فیلتر کردن متن‌های خالی
-        texts = [text for text in texts if text.strip()]
-        # تقسیم متن‌های طولانی به بخش‌های کوچکتر
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=300,
             chunk_overlap=50,
             length_function=len,
             separators=["\n\n", "\n", " ", ""]
         )
         split_texts = []
         for text in texts:
             split_texts.extend(text_splitter.split_text(text))
-        # ایجاد embeddings
         embeddings = TogetherEmbeddings(
             model_name="togethercomputer/m2-bert-80M-8k-retrieval",
             api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
         )
-        # استفاده از VectorstoreIndexCreator برای ساخت ایندکس
         index_creator = VectorstoreIndexCreator(
             embedding=embeddings,
             text_splitter=text_splitter
         )
-        # تبدیل متون به اسناد (documents)
-        from langchain.docstore.document import Document
         documents = [Document(page_content=text) for text in split_texts]
         return index_creator.from_documents(documents)
 # مسیر فایل CSV
-csv_file_path = 'output (1).csv'
 try:
-    # ساخت ایندکس
-    csv_index = get_csv_index(csv_file_path)
-    st.success("ایندکس فایل CSV با موفقیت ساخته شد!")
 except Exception as e:
-    st.error(f"خطا در ساخت ایندکس: {str(e)}")

 from together import Together
 import pandas as pd
 import streamlit as st
+from docx import Document as DocxDocument
 # ----------------- تنظیمات صفحه -----------------
 st.set_page_config(page_title="رزم یار ارتش", page_icon="🪖", layout="wide")
         return self.embed_documents([text])[0]
 @st.cache_resource
+def get_docx_index(folder_path):
+    with st.spinner('📄 در حال پردازش فایل‌های Word...'):
+        texts = []
+        # خواندن تمام فایل‌های .docx در پوشه
+        for filename in os.listdir(folder_path):
+            if filename.endswith(".docx"):
+                full_path = os.path.join(folder_path, filename)
+                doc = DocxDocument(full_path)
+                # استخراج متن تمام پاراگراف‌ها
+                file_text = "\n".join([para.text for para in doc.paragraphs])
+                if file_text.strip():
+                    texts.append(file_text)
+        # تقسیم متن‌ها
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=300,
             chunk_overlap=50,
             length_function=len,
             separators=["\n\n", "\n", " ", ""]
         )
         split_texts = []
         for text in texts:
             split_texts.extend(text_splitter.split_text(text))
+        # ایجاد embedding
         embeddings = TogetherEmbeddings(
             model_name="togethercomputer/m2-bert-80M-8k-retrieval",
             api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
         )
+        # ساخت ایندکس
         index_creator = VectorstoreIndexCreator(
             embedding=embeddings,
             text_splitter=text_splitter
         )
         documents = [Document(page_content=text) for text in split_texts]
         return index_creator.from_documents(documents)
 # مسیر پوشه فایل‌های Word (.docx)
+folder_path = 'docs_folder'
 try:
+    docx_index = get_docx_index(folder_path)
 except Exception as e:
+    st.error(f"❌ خطا در ساخت ایندکس: {e}")