Update app.py
app.py
CHANGED
@@ -29,12 +29,10 @@ st.set_page_config(page_title="چتبات ارتش - فقط از PDF", page_i
 
 @st.cache_resource
 def build_pdf_index():
-    with st.spinner('📄 در حال پردازش فایل
-        # بارگذاری فایل
+    with st.spinner('📄 در حال پردازش فایل ...'):
         loader = PyPDFLoader("test1.pdf")
         pages = loader.load()
 
-        # تکه‌تکه کردن متن
         splitter = RecursiveCharacterTextSplitter(
             chunk_size=500,
             chunk_overlap=50
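
For context, this hunk wraps the whole load-and-split step in the spinner and drops the Persian inline comments. Below is a minimal standalone sketch of that step, assuming the langchain_community import layout and a hypothetical helper name (load_and_split); the spinner text is shown in English here.

import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

@st.cache_resource
def load_and_split(pdf_path: str = "test1.pdf"):
    # st.cache_resource keeps the result across Streamlit reruns, so the PDF is
    # parsed and chunked only once per session.
    with st.spinner("Processing PDF ..."):
        pages = PyPDFLoader(pdf_path).load()               # one Document per page
        full_text = "\n".join(p.page_content for p in pages)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,    # max characters per chunk
            chunk_overlap=50,  # overlap so sentences are not cut at chunk borders
        )
        return splitter.split_text(full_text)              # list of text chunks
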
@@ -46,25 +44,20 @@ def build_pdf_index():
 
         documents = [LangchainDocument(page_content=t) for t in texts]
 
-        # مدل Embedding
         sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
 
-        # پروگرس بار
         progress_bar = st.progress(0)
         total_docs = len(documents)
 
-        # آماده‌سازی داده‌ها
         texts_to_encode = [doc.page_content for doc in documents]
 
-
-        batch_size = 32  # سایز دلخواه
+        batch_size = 128
         embeddings = []
         for i in range(0, total_docs, batch_size):
             batch_texts = texts_to_encode[i:i+batch_size]
             batch_embeddings = sentence_model.encode(batch_texts, convert_to_numpy=True)
             embeddings.extend(batch_embeddings)
 
-            # بروزرسانی پروگرس بار
             progress_bar.progress(min((i + batch_size) / total_docs, 1.0))
 
         embeddings = np.array(embeddings)
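
The manual batching loop exists mainly so the Streamlit progress bar can advance between encode() calls; this commit only raises the batch size from 32 to 128. A standalone sketch of the same pattern, with the helper name embed_in_batches being hypothetical:

import numpy as np
import streamlit as st
from sentence_transformers import SentenceTransformer

def embed_in_batches(texts, batch_size=128):
    # Encode in slices so the page's progress bar can be refreshed between calls.
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    progress_bar = st.progress(0)
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        embeddings.extend(model.encode(batch, convert_to_numpy=True))
        # The last batch can overshoot len(texts), so clamp the fraction at 1.0.
        progress_bar.progress(min((i + batch_size) / len(texts), 1.0))
    return np.array(embeddings)

SentenceTransformer.encode also accepts batch_size and show_progress_bar arguments, but its built-in progress bar prints to the console rather than the Streamlit page, which is presumably why the explicit loop is kept.
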
@@ -96,7 +89,11 @@ class SimpleRetriever(BaseRetriever):
             similarity = (query_embedding * doc_embedding).sum()
             similarities.append(similarity)
 
-        ranked_docs = sorted(
+        ranked_docs = sorted(
+            zip(similarities, self.documents),
+            key=lambda x: x[0],
+            reverse=True
+        )
         return [doc for _, doc in ranked_docs[:5]]
 
 # ----------------- ساخت Index -----------------
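
The five added lines rebuild the score-and-sort step that previously sat on one (truncated) line. Sorting with key=lambda x: x[0] is what makes this safe: a plain sorted(zip(...)) would fall back to comparing the Document objects whenever two scores tie and raise a TypeError. A standalone sketch of the ranking, with the function name and the k=5 default chosen for illustration:

import numpy as np

def rank_top_k(query_embedding, doc_embeddings, documents, k=5):
    # Dot-product similarity against every stored embedding, highest score first.
    similarities = [float(np.dot(query_embedding, emb)) for emb in doc_embeddings]
    ranked_docs = sorted(
        zip(similarities, documents),
        key=lambda x: x[0],   # sort by score only, never by the Document objects
        reverse=True,
    )
    return [doc for _, doc in ranked_docs[:k]]

Note that the raw dot product only equals cosine similarity when the embeddings are L2-normalized; passing normalize_embeddings=True to encode() would make the two match.
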
@@ -135,7 +132,7 @@ if prompt:
 if st.session_state.pending_prompt:
     with st.chat_message('ai'):
         thinking = st.empty()
-        thinking.markdown("🤖 در حال فکر کردن
+        thinking.markdown("🤖 در حال فکر کردن ...")
 
         try:
             response = chain.run(f"سوال: {st.session_state.pending_prompt}")
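
For reference, the st.empty() placeholder pattern this hunk tweaks: the placeholder reserves a single slot whose contents can be overwritten, so the "thinking" note is replaced by the final answer instead of a second message stacking underneath it. A hedged sketch, with chain assumed to be the LangChain chain built elsewhere in app.py and the UI strings shown in English:

import streamlit as st

def answer_pending_prompt(chain):
    # Hypothetical wrapper around the block above; `chain` is the LangChain chain
    # built elsewhere in app.py.
    if st.session_state.get("pending_prompt"):
        with st.chat_message("ai"):
            thinking = st.empty()
            thinking.markdown("🤖 Thinking ...")
            try:
                answer = chain.run(f"Question: {st.session_state.pending_prompt}")
                thinking.markdown(answer)   # overwrite the placeholder in place
            except Exception as exc:
                thinking.markdown(f"Something went wrong: {exc}")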