Update app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1 |
import streamlit as st
|
2 |
import time
|
|
|
|
|
3 |
import numpy as np
|
4 |
from langchain.document_loaders import PyPDFLoader
|
5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
-
from
|
7 |
from langchain.chat_models import ChatOpenAI
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
|
@@ -11,22 +13,45 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
11 |
|
12 |
@st.cache_resource
|
13 |
def load_chunks_and_embeddings():
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
)
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
chunk_texts, chunk_embeddings, embeddings_model = load_chunks_and_embeddings()
|
32 |
|
@@ -38,27 +63,28 @@ llm = ChatOpenAI(
|
|
38 |
model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
|
39 |
)
|
40 |
|
41 |
-
# ------------------
|
42 |
|
43 |
def answer_from_pdf(question):
|
44 |
-
# ۱-
|
45 |
question_embedding = embeddings_model.embed_query(question)
|
46 |
|
47 |
-
# ۲- شباهت
|
48 |
similarities = cosine_similarity(
|
49 |
[question_embedding],
|
50 |
chunk_embeddings
|
51 |
-
)
|
52 |
|
53 |
-
# ۳-
|
54 |
-
|
55 |
-
|
56 |
|
57 |
# ۴- ساخت پرامپت
|
58 |
-
|
|
|
59 |
|
60 |
متن:
|
61 |
-
{
|
62 |
|
63 |
سوال:
|
64 |
{question}
|
@@ -70,7 +96,7 @@ def answer_from_pdf(question):
|
|
70 |
|
71 |
# ------------------ Chat Streamlit UI ------------------
|
72 |
|
73 |
-
st.title('📚 چت با PDF')
|
74 |
|
75 |
if 'messages' not in st.session_state:
|
76 |
st.session_state.messages = []
|
@@ -97,7 +123,7 @@ if st.session_state.pending_prompt:
|
|
97 |
thinking = st.empty()
|
98 |
thinking.markdown("🤖 در حال پردازش...")
|
99 |
|
100 |
-
# پاسخ بر اساس نزدیکترین
|
101 |
response = answer_from_pdf(st.session_state.pending_prompt)
|
102 |
answer = response.strip()
|
103 |
if not answer:
|
@@ -116,3 +142,4 @@ if st.session_state.pending_prompt:
|
|
116 |
placeholder.markdown(full_response)
|
117 |
st.session_state.messages.append({'role': 'ai', 'content': full_response})
|
118 |
st.session_state.pending_prompt = None
|
|
|
|
1 |
import streamlit as st
|
2 |
import time
|
3 |
+
import os
|
4 |
+
import pickle
|
5 |
import numpy as np
|
6 |
from langchain.document_loaders import PyPDFLoader
|
7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
+
from langchain.embeddings import TogetherEmbeddings
|
9 |
from langchain.chat_models import ChatOpenAI
|
10 |
from sklearn.metrics.pairwise import cosine_similarity
|
11 |
|
|
|
13 |
|
14 |
@st.cache_resource
def load_chunks_and_embeddings():
    """Load PDF chunks and their embeddings, using an on-disk pickle cache.

    On a cache hit, reads 'embeddings.pkl'. On a miss, loads 'test1.pdf',
    splits it into 300-char chunks, embeds each chunk with TogetherEmbeddings
    (progress bar shown), and writes the cache for next time.

    Returns:
        tuple: (chunk_texts, chunk_embeddings, embeddings_model) — the model
        is returned so callers can embed queries with the same model that
        produced the chunk embeddings.
    """
    embeddings_file = 'embeddings.pkl'

    if os.path.exists(embeddings_file):
        # NOTE(review): pickle.load runs arbitrary code if the file is
        # tampered with — only load cache files this app wrote itself.
        with open(embeddings_file, 'rb') as f:
            data = pickle.load(f)
        # Report success only AFTER the cache was actually read
        # (original showed the message before loading).
        st.success("✅ امبدینگ‌ها از فایل کش بارگذاری شد.")
        return data['chunk_texts'], data['chunk_embeddings'], data['embeddings_model']

    with st.spinner('📄 در حال پردازش PDF و ساخت امبدینگ‌ها...'):
        loader = PyPDFLoader('test1.pdf')
        pages = loader.load()

        splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
        chunks = splitter.split_documents(pages)

        # SECURITY FIX: the Together API key was hard-coded in source (and,
        # via the pickled model object below, also leaked into embeddings.pkl).
        # Read it from the environment instead; revoke the old key.
        embeddings_model = TogetherEmbeddings(
            api_key=os.environ["TOGETHER_API_KEY"]
        )

        chunk_texts = [chunk.page_content for chunk in chunks]

        # Embed chunk-by-chunk (rather than in one batch) so we can drive
        # a per-chunk progress bar.
        progress = st.progress(0, text="در حال ساخت امبدینگ چانک‌ها...")
        chunk_embeddings = []
        for i, text in enumerate(chunk_texts):
            chunk_embeddings.append(embeddings_model.embed_query(text))
            progress.progress((i + 1) / len(chunk_texts))

        # Persist the cache. NOTE(review): pickling the model object stores
        # its credentials in the file; kept for cache-format compatibility,
        # but consider storing only texts + embeddings.
        with open(embeddings_file, 'wb') as f:
            pickle.dump({
                'chunk_texts': chunk_texts,
                'chunk_embeddings': chunk_embeddings,
                'embeddings_model': embeddings_model,
            }, f)

        st.success(f"✅ {len(chunk_texts)} چانک پردازش و ذخیره شد.")
        return chunk_texts, chunk_embeddings, embeddings_model
55 |
|
# Materialize the chunks + embeddings once per session (st.cache_resource
# memoizes the call, so reruns of the script reuse the same objects).
chunk_texts, chunk_embeddings, embeddings_model = load_chunks_and_embeddings()
|
57 |
|
|
|
63 |
model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
|
64 |
)
|
65 |
|
66 |
+
# ------------------ پاسخ بر اساس ۱۰ چانک نزدیک ------------------
|
67 |
|
68 |
def answer_from_pdf(question):
|
69 |
+
# ۱- ساخت امبدینگ سوال
|
70 |
question_embedding = embeddings_model.embed_query(question)
|
71 |
|
72 |
+
# ۲- محاسبه شباهت
|
73 |
similarities = cosine_similarity(
|
74 |
[question_embedding],
|
75 |
chunk_embeddings
|
76 |
+
)[0]
|
77 |
|
78 |
+
# ۳- انتخاب ۱۰ چانک نزدیک
|
79 |
+
top_indices = np.argsort(similarities)[-10:][::-1]
|
80 |
+
selected_chunks = [chunk_texts[i] for i in top_indices]
|
81 |
|
82 |
# ۴- ساخت پرامپت
|
83 |
+
context = "\n\n".join(selected_chunks)
|
84 |
+
prompt = f"""با توجه به متن زیر فقط به زبان فارسی پاسخ بده:
|
85 |
|
86 |
متن:
|
87 |
+
{context}
|
88 |
|
89 |
سوال:
|
90 |
{question}
|
|
|
96 |
|
97 |
# ------------------ Chat Streamlit UI ------------------
|
98 |
|
99 |
+
st.title('📚 چت با PDF (با ۱۰ چانک نزدیک و کش شده)')
|
100 |
|
101 |
if 'messages' not in st.session_state:
|
102 |
st.session_state.messages = []
|
|
|
123 |
thinking = st.empty()
|
124 |
thinking.markdown("🤖 در حال پردازش...")
|
125 |
|
126 |
+
# پاسخ بر اساس نزدیکترین چانکها
|
127 |
response = answer_from_pdf(st.session_state.pending_prompt)
|
128 |
answer = response.strip()
|
129 |
if not answer:
|
|
|
142 |
placeholder.markdown(full_response)
|
143 |
st.session_state.messages.append({'role': 'ai', 'content': full_response})
|
144 |
st.session_state.pending_prompt = None
|
145 |
+
|