Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,8 @@ from typing import List
|
|
11 |
from together import Together
|
12 |
import pandas as pd
|
13 |
import streamlit as st
|
|
|
|
|
14 |
|
15 |
# ----------------- تنظیمات صفحه -----------------
|
16 |
st.set_page_config(page_title="رزم یار ارتش", page_icon="🪖", layout="wide")
|
@@ -201,56 +203,55 @@ class TogetherEmbeddings(Embeddings):
|
|
201 |
return self.embed_documents([text])[0]
|
202 |
|
203 |
@st.cache_resource
|
204 |
-
def
|
205 |
-
with st.spinner('📄 در حال پردازش
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
|
|
|
|
|
|
|
|
216 |
text_splitter = RecursiveCharacterTextSplitter(
|
217 |
chunk_size=300,
|
218 |
chunk_overlap=50,
|
219 |
length_function=len,
|
220 |
separators=["\n\n", "\n", " ", ""]
|
221 |
)
|
222 |
-
|
223 |
split_texts = []
|
224 |
for text in texts:
|
225 |
split_texts.extend(text_splitter.split_text(text))
|
226 |
|
227 |
-
# ایجاد
|
228 |
embeddings = TogetherEmbeddings(
|
229 |
model_name="togethercomputer/m2-bert-80M-8k-retrieval",
|
230 |
api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
|
231 |
)
|
232 |
|
233 |
-
#
|
234 |
index_creator = VectorstoreIndexCreator(
|
235 |
embedding=embeddings,
|
236 |
text_splitter=text_splitter
|
237 |
)
|
238 |
-
|
239 |
-
# تبدیل متون به اسناد (documents)
|
240 |
-
from langchain.docstore.document import Document
|
241 |
documents = [Document(page_content=text) for text in split_texts]
|
242 |
-
|
243 |
return index_creator.from_documents(documents)
|
244 |
|
245 |
# مسیر فایل CSV
|
246 |
-
|
247 |
|
248 |
try:
|
249 |
-
|
250 |
-
csv_index = get_csv_index(csv_file_path)
|
251 |
-
st.success("ایندکس فایل CSV با موفقیت ساخته شد!")
|
252 |
except Exception as e:
|
253 |
-
st.error(f"خطا در ساخت ایندکس: {
|
254 |
|
255 |
|
256 |
|
|
|
11 |
from together import Together
|
12 |
import pandas as pd
|
13 |
import streamlit as st
|
14 |
+
from docx import Document as DocxDocument
|
15 |
+
|
16 |
|
17 |
# ----------------- تنظیمات صفحه -----------------
|
18 |
st.set_page_config(page_title="رزم یار ارتش", page_icon="🪖", layout="wide")
|
|
|
203 |
return self.embed_documents([text])[0]
|
204 |
|
205 |
@st.cache_resource
def get_docx_index(folder_path):
    """Build a vector-store index from the Word (.docx) files in a folder.

    Each ``.docx`` file directly inside ``folder_path`` is opened with
    python-docx, the text of its paragraphs is joined with newlines, and the
    result is cut into overlapping chunks. The chunks are wrapped in
    LangChain ``Document`` objects, embedded through ``TogetherEmbeddings``
    and handed to ``VectorstoreIndexCreator``. The function is decorated
    with ``st.cache_resource``.

    Args:
        folder_path: Directory to scan for ``.docx`` files (not recursive).

    Returns:
        The object returned by ``VectorstoreIndexCreator.from_documents``
        for the chunk documents.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
        ValueError: If no ``.docx`` file with non-blank text is found.
    """
    # Imported here so the name is bound even when the module header does not
    # import it; harmless if the header already does.
    from langchain.docstore.document import Document

    with st.spinner('📄 در حال پردازش فایلهای Word...'):
        texts = []

        # sorted() makes the chunk order independent of the directory
        # listing order, which os.listdir does not guarantee.
        for filename in sorted(os.listdir(folder_path)):
            # "~$name.docx" files are lock files Word leaves beside an open
            # document; they are not readable documents.
            if filename.startswith("~$"):
                continue
            if not filename.lower().endswith(".docx"):
                continue

            doc = DocxDocument(os.path.join(folder_path, filename))

            # Only paragraph text is collected (doc.paragraphs).
            file_text = "\n".join(para.text for para in doc.paragraphs)
            if file_text.strip():
                texts.append(file_text)

        if not texts:
            # Fail with a clear message instead of building (and caching)
            # an index that contains nothing.
            raise ValueError(
                f"No .docx file with text was found in {folder_path!r}"
            )

        # Split the collected texts into small overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=50,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )
        split_texts = []
        for text in texts:
            split_texts.extend(text_splitter.split_text(text))

        # NOTE(review): the literal key below is committed to the repository
        # and should be rotated and removed. TOGETHER_API_KEY takes
        # precedence when it is set, so deployments can already stop relying
        # on the literal.
        embeddings = TogetherEmbeddings(
            model_name="togethercomputer/m2-bert-80M-8k-retrieval",
            api_key=os.environ.get(
                "TOGETHER_API_KEY",
                "0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979",
            ),
        )

        # Build the index; the same splitter is passed on to the creator.
        index_creator = VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=text_splitter,
        )

        documents = [Document(page_content=text) for text in split_texts]

        return index_creator.from_documents(documents)
|
247 |
|
248 |
# Folder that holds the Word (.docx) source documents
folder_path = 'docs_folder'

try:
    docx_index = get_docx_index(folder_path)
except Exception as e:
    # Keep the name bound so later code can check for a missing index
    # instead of failing with NameError.
    docx_index = None
    st.error(f"❌ خطا در ساخت ایندکس: {e}")
|
255 |
|
256 |
|
257 |
|