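"""Streamlit chatbot that answers questions only from a single PDF ("test1.pdf").

Pipeline: the PDF is loaded and split into overlapping chunks, each chunk is embedded
with a multilingual SentenceTransformer model, a simple dot-product retriever selects
the most similar chunks for a query, and a RetrievalQA chain sends them to an
OpenAI-compatible chat endpoint to produce the answer. The user-facing strings are in
Persian. Run with `streamlit run <this file>` (the file name is not specified here).
"""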
import os
import time
import streamlit as st
from langchain.chat_models import ChatOpenAI

from transformers import AutoTokenizer, AutoModel
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LangchainDocument
from langchain.chains import RetrievalQA
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from typing import List
from pydantic import Field
from sentence_transformers import SentenceTransformer
import numpy as np

# ----------------- Page configuration -----------------
# (The Persian page title reads "Army chatbot - only from the PDF".)
st.set_page_config(page_title="چت‌بات ارتش - فقط از PDF", page_icon="🪖", layout="wide")

# ----------------- Load FarsiBERT model (currently disabled) -----------------
# model_name = "HooshvareLab/bert-fa-zwnj-base"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

# ----------------- Load the PDF and build the index -----------------

@st.cache_resource
def build_pdf_index():
    with st.spinner('📄 در حال پردازش فایل PDF...'):  # "Processing the PDF file..."
        # Load the PDF file
        loader = PyPDFLoader("test1.pdf")
        pages = loader.load()

        # Split the text into chunks
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )

        texts = []
        for page in pages:
            texts.extend(splitter.split_text(page.page_content))

        documents = [LangchainDocument(page_content=t) for t in texts]

        # Embedding model
        sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

        embeddings = []

        # Add a progress bar
        progress_bar = st.progress(0)
        total_docs = len(documents)

        # Use only the SentenceTransformer model for the embeddings
        for i, doc in enumerate(documents):
            batch_embedding = sentence_model.encode(doc.page_content, convert_to_numpy=True)
            embeddings.append(batch_embedding)

            # Update the progress bar
            progress_bar.progress((i + 1) / total_docs)

        # Make sure the output is a NumPy array
        embeddings = np.array(embeddings)

        return documents, embeddings
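
# Note: SentenceTransformer.encode also accepts a list of strings, so the chunks could be
# embedded in a single batched call (e.g. sentence_model.encode([d.page_content for d in documents],
# batch_size=32)); the per-document loop above is kept only to drive the Streamlit progress bar.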


# ----------------- LLM configuration (Together AI, OpenAI-compatible API) -----------------
# ChatOpenAI is used here instead of the plain OpenAI LLM wrapper.
# API keys should not be hard-coded; the key is read from the environment
# (assumed variable name: TOGETHER_API_KEY).
llm = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ.get("TOGETHER_API_KEY"),
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
)

# ----------------- SimpleRetriever definition -----------------
class SimpleRetriever(BaseRetriever):
    documents: List[Document] = Field(...)
    embeddings: List = Field(...)

    def _get_relevant_documents(self, query: str) -> List[Document]:
        # Embed the query with the same SentenceTransformer model used for the chunks
        # (re-loading the model on every query is slow; it could be loaded once and reused)
        sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        query_embedding = sentence_model.encode(query, convert_to_numpy=True)

        # Score each chunk by the dot product between the query and chunk embeddings
        similarities = []
        for doc_embedding in self.embeddings:
            similarity = (query_embedding * doc_embedding).sum()
            similarities.append(similarity)

        # Sort by score only; sorting the raw tuples would compare Document objects on ties
        ranked_docs = sorted(zip(similarities, self.documents), key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in ranked_docs[:5]]
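
# Note: the chunk embeddings are not normalized, so the dot-product score above is not a
# true cosine similarity; if cosine similarity is preferred, one option is to pass
# normalize_embeddings=True to sentence_model.encode() for both the chunks and the query.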

# ----------------- Build the index -----------------
documents, embeddings = build_pdf_index()
retriever = SimpleRetriever(documents=documents, embeddings=embeddings)

# ----------------- Build the RetrievalQA chain -----------------
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    input_key="question"
)

# ----------------- Session state for the chat -----------------
if 'messages' not in st.session_state:
    st.session_state.messages = []

if 'pending_prompt' not in st.session_state:
    st.session_state.pending_prompt = None

# ----------------- Display previous messages -----------------
for msg in st.session_state.messages:
    with st.chat_message(msg['role']):
        st.markdown(f"🗨️ {msg['content']}", unsafe_allow_html=True)

# ----------------- Chat input -----------------
prompt = st.chat_input("سوالی در مورد فایل بپرس...")  # "Ask a question about the file..."

if prompt:
    st.session_state.messages.append({'role': 'user', 'content': prompt})
    st.session_state.pending_prompt = prompt
    st.rerun()

# ----------------- Model response -----------------
if st.session_state.pending_prompt:
    with st.chat_message('ai'):
        thinking = st.empty()
        thinking.markdown("🤖 در حال فکر کردن از روی PDF...")  # "Thinking, based on the PDF..."

        try:
            response = chain.run(f"سوال: {st.session_state.pending_prompt}")  # "سوال" = "Question"
            answer = response.strip()
        except Exception as e:
            answer = f"خطا در پاسخ‌دهی: {str(e)}"  # "Error while answering: ..."

        thinking.empty()

        full_response = ""
        placeholder = st.empty()
        for word in answer.split():
            full_response += word + " "
            placeholder.markdown(full_response + "▌")
            time.sleep(0.03)

        placeholder.markdown(full_response)
        st.session_state.messages.append({'role': 'ai', 'content': full_response})
        st.session_state.pending_prompt = None