M17idd committed on
Commit
95cb532
·
verified ·
1 Parent(s): 20419dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -11
app.py CHANGED
@@ -29,12 +29,10 @@ st.set_page_config(page_title="چت‌بات ارتش - فقط از PDF", page_i
29
 
30
  @st.cache_resource
31
  def build_pdf_index():
32
- with st.spinner('📄 در حال پردازش فایل PDF...'):
33
- # بارگذاری فایل
34
  loader = PyPDFLoader("test1.pdf")
35
  pages = loader.load()
36
 
37
- # تکه‌تکه کردن متن
38
  splitter = RecursiveCharacterTextSplitter(
39
  chunk_size=500,
40
  chunk_overlap=50
@@ -46,25 +44,20 @@ def build_pdf_index():
46
 
47
  documents = [LangchainDocument(page_content=t) for t in texts]
48
 
49
- # مدل Embedding
50
  sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
51
 
52
- # پروگرس بار
53
  progress_bar = st.progress(0)
54
  total_docs = len(documents)
55
 
56
- # آماده‌سازی داده‌ها
57
  texts_to_encode = [doc.page_content for doc in documents]
58
 
59
- # انکود بچی
60
- batch_size = 32 # سایز دلخواه
61
  embeddings = []
62
  for i in range(0, total_docs, batch_size):
63
  batch_texts = texts_to_encode[i:i+batch_size]
64
  batch_embeddings = sentence_model.encode(batch_texts, convert_to_numpy=True)
65
  embeddings.extend(batch_embeddings)
66
 
67
- # بروزرسانی پروگرس بار
68
  progress_bar.progress(min((i + batch_size) / total_docs, 1.0))
69
 
70
  embeddings = np.array(embeddings)
@@ -96,7 +89,11 @@ class SimpleRetriever(BaseRetriever):
96
  similarity = (query_embedding * doc_embedding).sum()
97
  similarities.append(similarity)
98
 
99
- ranked_docs = sorted(zip(similarities, self.documents), reverse=True)
 
 
 
 
100
  return [doc for _, doc in ranked_docs[:5]]
101
 
102
  # ----------------- ساخت Index -----------------
@@ -135,7 +132,7 @@ if prompt:
135
  if st.session_state.pending_prompt:
136
  with st.chat_message('ai'):
137
  thinking = st.empty()
138
- thinking.markdown("🤖 در حال فکر کردن از روی PDF...")
139
 
140
  try:
141
  response = chain.run(f"سوال: {st.session_state.pending_prompt}")
 
29
 
30
  @st.cache_resource
31
  def build_pdf_index():
32
+ with st.spinner('📄 در حال پردازش فایل ...'):
 
33
  loader = PyPDFLoader("test1.pdf")
34
  pages = loader.load()
35
 
 
36
  splitter = RecursiveCharacterTextSplitter(
37
  chunk_size=500,
38
  chunk_overlap=50
 
44
 
45
  documents = [LangchainDocument(page_content=t) for t in texts]
46
 
 
47
  sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
48
 
 
49
  progress_bar = st.progress(0)
50
  total_docs = len(documents)
51
 
 
52
  texts_to_encode = [doc.page_content for doc in documents]
53
 
54
+ batch_size = 128
 
55
  embeddings = []
56
  for i in range(0, total_docs, batch_size):
57
  batch_texts = texts_to_encode[i:i+batch_size]
58
  batch_embeddings = sentence_model.encode(batch_texts, convert_to_numpy=True)
59
  embeddings.extend(batch_embeddings)
60
 
 
61
  progress_bar.progress(min((i + batch_size) / total_docs, 1.0))
62
 
63
  embeddings = np.array(embeddings)
 
89
  similarity = (query_embedding * doc_embedding).sum()
90
  similarities.append(similarity)
91
 
92
+ ranked_docs = sorted(
93
+ zip(similarities, self.documents),
94
+ key=lambda x: x[0],
95
+ reverse=True
96
+ )
97
  return [doc for _, doc in ranked_docs[:5]]
98
 
99
  # ----------------- ساخت Index -----------------
 
132
  if st.session_state.pending_prompt:
133
  with st.chat_message('ai'):
134
  thinking = st.empty()
135
+ thinking.markdown("🤖 در حال فکر کردن ...")
136
 
137
  try:
138
  response = chain.run(f"سوال: {st.session_state.pending_prompt}")