Update app.py
Browse files
app.py
CHANGED
@@ -11,11 +11,7 @@ from langchain.chat_models import ChatOpenAI
|
|
11 |
from typing import List
|
12 |
from together import Together
|
13 |
|
14 |
-
|
15 |
-
import tiktoken
|
16 |
-
from langchain.document_loaders import PyPDFLoader
|
17 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
18 |
-
from langchain.vectorstores import VectorstoreIndexCreator
|
19 |
from langchain.embeddings import TogetherEmbeddings
|
20 |
from langchain.schema import Document as LangchainDocument
|
21 |
|
@@ -140,16 +136,14 @@ def get_pdf_index():
|
|
140 |
else:
|
141 |
small_chunks.append(text)
|
142 |
|
143 |
-
# Now do the real check based on the token count
|
144 |
final_chunks = []
|
145 |
-
max_tokens = 2000
|
146 |
|
147 |
for chunk in small_chunks:
|
148 |
token_count = count_tokens(chunk, model_name="gpt-3.5-turbo")
|
149 |
if token_count > max_tokens:
|
150 |
-
# If the chunk is too large, break it into smaller pieces
|
151 |
splitter_token_safe = RecursiveCharacterTextSplitter(
|
152 |
-
chunk_size=1000,
|
153 |
chunk_overlap=100
|
154 |
)
|
155 |
smaller_chunks = splitter_token_safe.split_text(chunk)
|
@@ -164,13 +158,10 @@ def get_pdf_index():
|
|
164 |
api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
|
165 |
)
|
166 |
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
chunk_overlap=100
|
172 |
-
)
|
173 |
-
).from_documents(documents)
|
174 |
|
175 |
index = get_pdf_index()
|
176 |
|
|
|
11 |
from typing import List
|
12 |
from together import Together
|
13 |
|
14 |
+
|
|
|
|
|
|
|
|
|
15 |
from langchain.embeddings import TogetherEmbeddings
|
16 |
from langchain.schema import Document as LangchainDocument
|
17 |
|
|
|
136 |
else:
|
137 |
small_chunks.append(text)
|
138 |
|
|
|
139 |
final_chunks = []
|
140 |
+
max_tokens = 2000
|
141 |
|
142 |
for chunk in small_chunks:
|
143 |
token_count = count_tokens(chunk, model_name="gpt-3.5-turbo")
|
144 |
if token_count > max_tokens:
|
|
|
145 |
splitter_token_safe = RecursiveCharacterTextSplitter(
|
146 |
+
chunk_size=1000,
|
147 |
chunk_overlap=100
|
148 |
)
|
149 |
smaller_chunks = splitter_token_safe.split_text(chunk)
|
|
|
158 |
api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
|
159 |
)
|
160 |
|
161 |
+
# From here on, build the vector store directly with FAISS
|
162 |
+
vectordb = FAISS.from_documents(documents, embedding=embeddings)
|
163 |
+
|
164 |
+
return vectordb
|
|
|
|
|
|
|
165 |
|
166 |
index = get_pdf_index()
|
167 |
|