Update app.py
app.py CHANGED
@@ -13,6 +13,8 @@ from typing import List
 from pydantic import Field
 from sentence_transformers import SentenceTransformer
 import numpy as np
+from langchain.vectorstores import VectorstoreIndexCreator
+from sentence_transformers import SentenceTransformer
 import faiss

 # ----------------- Page settings -----------------
@@ -99,47 +101,22 @@ st.markdown("""
 # ----------------- Load the PDF and build the index -----------------

 @st.cache_resource
-
-
-
-
-
-    splitter = RecursiveCharacterTextSplitter(
-        chunk_size=128,
-        chunk_overlap=50
-    )
-
-    texts = []
-    for page in pages:
-        texts.extend(splitter.split_text(page.page_content))
-
-    documents = [LangchainDocument(page_content=t) for t in texts]
-
-    sentence_model = SentenceTransformer("togethercomputer/m2-bert-80M-8k-retrieval", trust_remote_code=True)
-
-    progress_bar = st.progress(0)
-    total_docs = len(documents)
-
-    texts_to_encode = [doc.page_content for doc in documents]
-
-    batch_size = 128
-    embeddings = []
-    for i in range(0, total_docs, batch_size):
-        batch_texts = texts_to_encode[i:i+batch_size]
-        batch_embeddings = sentence_model.encode(batch_texts, convert_to_numpy=True)
-        embeddings.extend(batch_embeddings)
+@st.cache_resource
+def get_pdf_index():
+    with st.spinner('📄 Processing the PDF file...'):
+        loader = [PyPDFLoader('test1.pdf')]

-
+        model_name = "togethercomputer/m2-bert-80M-8k-retrieval"
+        model = SentenceTransformer(model_name)

-
-    time.sleep(5)
-    progress_bar.empty()
-    embeddings = np.array(embeddings)
+        embeddings = model.encode

-
-
+        index_creator = VectorstoreIndexCreator(
+            embedding=embeddings,
+            text_splitter=RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
+        )

-    return
+        return index_creator.from_loaders(loader)

 # ----------------- Define the LLM from Groq -----------------
 llm = ChatOpenAI(
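The removed branch chunks the PDF pages, embeds the chunks in batches with SentenceTransformer, and collects the vectors into a NumPy array, but the captured diff cuts off at a bare return before any index is built. Since faiss is imported at the top of the file, the missing step was presumably a FAISS index over those vectors; a minimal sketch of that completion, with the function name and the IndexFlatL2 choice as assumptions rather than content of the original file, could be:

import faiss
import numpy as np

def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatL2:
    # Hypothetical completion: exact L2 index over the chunk embeddings.
    # faiss expects a contiguous float32 matrix of shape (n_vectors, dim).
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index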
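In the added branch, embedding=model.encode hands VectorstoreIndexCreator a bare function, while that field expects a LangChain Embeddings object with embed_documents and embed_query methods; the class is also normally imported from langchain.indexes rather than langchain.vectorstores. A minimal sketch of a version that satisfies that interface, assuming HuggingFaceEmbeddings as the wrapper and FAISS as the vector store (neither appears in the commit), would be:

import streamlit as st
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

@st.cache_resource
def get_pdf_index():
    with st.spinner('📄 Processing the PDF file...'):
        loaders = [PyPDFLoader('test1.pdf')]
        # HuggingFaceEmbeddings wraps sentence-transformers and exposes the
        # embed_documents / embed_query interface that LangChain expects.
        # trust_remote_code mirrors the flag used in the removed code and
        # needs a recent sentence-transformers release.
        embeddings = HuggingFaceEmbeddings(
            model_name="togethercomputer/m2-bert-80M-8k-retrieval",
            model_kwargs={"trust_remote_code": True},
        )
        index_creator = VectorstoreIndexCreator(
            vectorstore_cls=FAISS,
            embedding=embeddings,
            text_splitter=RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0),
        )
        return index_creator.from_loaders(loaders)

The returned VectorStoreIndexWrapper can then be queried directly, for example with index.query(question, llm=llm), or exposed as a retriever through index.vectorstore.as_retriever().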
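The hunk ends at llm = ChatOpenAI( and its arguments are not part of the captured diff. One way to route ChatOpenAI through Groq's OpenAI-compatible endpoint looks like the sketch below; the model name and the GROQ_API_KEY variable are assumptions, not values taken from the file:

import os
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(
    openai_api_base="https://api.groq.com/openai/v1",  # Groq's OpenAI-compatible API
    openai_api_key=os.environ.get("GROQ_API_KEY", ""),  # assumed environment variable
    model_name="llama3-8b-8192",                        # assumed model id
    temperature=0,
)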
|