import streamlit as st
import time
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_together import TogetherEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
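
# NOTE: the Together API key is hardcoded below for simplicity. In a deployed
# app it would typically come from Streamlit's secrets store instead, e.g.
# (assuming a TOGETHER_API_KEY entry in .streamlit/secrets.toml):
#   api_key = st.secrets["TOGETHER_API_KEY"]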

# --- 📄 Embed documents in batches of 50
def batch_embed(docs, embeddings_model, batch_size=50):
    all_embeddings = []
    for i in range(0, len(docs), batch_size):
        batch = docs[i:i + batch_size]
        embs = embeddings_model.embed_documents([doc.page_content for doc in batch])
        all_embeddings.extend(embs)
    return all_embeddings
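
# Illustrative use of the helper above (it is not called elsewhere in this
# script), assuming `docs` is a list of LangChain Documents and `embeddings`
# is a TogetherEmbeddings instance:
#   vectors = batch_embed(docs, embeddings, batch_size=50)
#   # vectors[i] is the embedding vector for docs[i].page_content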

@st.cache_resource
def load_chunks_and_embeddings():
    pdf_loader = PyPDFLoader('test1.pdf')
    pages = pdf_loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
    docs = text_splitter.split_documents(pages)

    embeddings = TogetherEmbeddings(
        api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
    )
    
    # FAISS cannot be initialized from an empty document list, so the index
    # is created from the first embedded batch inside the loop below
    vectorstore = None

    # Progress bar
    progress = st.progress(0, text="🔄 در حال پردازش چانک‌ها...")
    total = len(docs)

    batch_size = 50
    for i in range(0, total, batch_size):
        batch_docs = docs[i:i + batch_size]
        texts = [doc.page_content for doc in batch_docs]
        metadatas = [doc.metadata for doc in batch_docs]
        embs = embeddings.embed_documents(texts)
        # add_embeddings expects (text, embedding) pairs, not parallel lists
        pairs = list(zip(texts, embs))
        if vectorstore is None:
            vectorstore = FAISS.from_embeddings(pairs, embedding=embeddings, metadatas=metadatas)
        else:
            vectorstore.add_embeddings(pairs, metadatas=metadatas)

        progress.progress(min((i + batch_size) / total, 1.0))

    progress.empty()
    return vectorstore
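
# Note: the FAISS index is rebuilt on every cold start. A sketch of on-disk
# caching (the "faiss_index" path is illustrative; newer LangChain versions
# may also require allow_dangerous_deserialization=True on load_local):
#   vectorstore.save_local("faiss_index")
#   vectorstore = FAISS.load_local("faiss_index", embeddings)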

# --- 🛠️ Build the vector store
with st.spinner("📚 در حال بارگذاری فایل و ساخت امبدینگ‌ها... لطفا صبور باشید"):
    vectorstore = load_chunks_and_embeddings()

# --- 🤖 Set up the LLM (Together's OpenAI-compatible endpoint)
llm = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key='0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979',
    model="meta-llama/Llama-3-70B-Instruct-Turbo-Free"
)

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    input_key='question'
)
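
# Alternative sketch: instead of prefixing every question with a language
# instruction (as done further below), the instruction could be baked into the
# chain's prompt via chain_type_kwargs (the template text is illustrative):
#   from langchain.prompts import PromptTemplate
#   qa_prompt = PromptTemplate(
#       input_variables=["context", "question"],
#       template=("Answer only in Persian, based on the context.\n"
#                 "Context: {context}\nQuestion: {question}\nHelpful Answer:"),
#   )
#   chain = RetrievalQA.from_chain_type(
#       llm=llm, chain_type='stuff', retriever=retriever,
#       input_key='question', chain_type_kwargs={"prompt": qa_prompt},
#   )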

# --- 💬 Chat bot
if 'messages' not in st.session_state:
    st.session_state.messages = []

if 'pending_prompt' not in st.session_state:
    st.session_state.pending_prompt = None

st.title("📄🤖 دستیار PDF شما")

# Display the conversation history
for msg in st.session_state.messages:
    with st.chat_message(msg['role']):
        st.markdown(f"🗨️ {msg['content']}", unsafe_allow_html=True)

prompt = st.chat_input("سوالی از PDF داری؟")

if prompt:
    # Store the question and rerun so the user's message renders immediately;
    # the answer is generated on the next pass via pending_prompt
    st.session_state.messages.append({'role': 'user', 'content': prompt})
    st.session_state.pending_prompt = prompt
    st.rerun()

if st.session_state.pending_prompt:
    with st.chat_message('ai'):
        thinking = st.empty()
        thinking.markdown("🤖 در حال فکر کردن...")

        # Run retrieval + answer generation; the question is prefixed with an
        # instruction to answer in Persian only
        response = chain.run(f'فقط به زبان فارسی جواب بده. سوال: {st.session_state.pending_prompt}')
        # Some models echo the prompt scaffold; keep only the text after the
        # final "Helpful Answer:" marker
        answer = response.split("Helpful Answer:")[-1].strip()
        if not answer:
            answer = "متأسفم، اطلاعات دقیقی در این مورد ندارم."

        thinking.empty()
        full_response = ""
        placeholder = st.empty()

        # Typewriter effect: reveal the answer word by word
        for word in answer.split():
            full_response += word + " "
            placeholder.markdown(full_response + "▌")
            time.sleep(0.03)

        placeholder.markdown(full_response)
        st.session_state.messages.append({'role': 'ai', 'content': full_response})
        st.session_state.pending_prompt = None