Update app.py
app.py CHANGED
```diff
@@ -1,4 +1,5 @@
 import time
+import tiktoken
 import streamlit as st
 from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -104,16 +105,19 @@ class TogetherEmbeddings(Embeddings):
     def embed_query(self, text: str) -> List[float]:
         return self.embed_documents([text])[0]
 
+
+def count_tokens(text, model_name="gpt-3.5-turbo"):
+    enc = tiktoken.encoding_for_model(model_name)
+    return len(enc.encode(text))
+
 @st.cache_resource
 def get_pdf_index():
     with st.spinner('📄 در حال پردازش فایل PDF...'):
-        # Load the file
         loader = [PyPDFLoader('test1.pdf')]
         pages = []
         for l in loader:
             pages.extend(l.load())
 
-        # First, ordinary chunking with size 124
         splitter_initial = RecursiveCharacterTextSplitter(
             chunk_size=124,
             chunk_overlap=25
@@ -127,35 +131,35 @@ def get_pdf_index():
         else:
             small_chunks.append(text)
 
-        # Now check
+        # Now a real check based on the token count
         final_chunks = []
-
-            chunk_size=2000,
-            chunk_overlap=200
-        )
+        max_tokens = 2000  # maximum tokens allowed for Together
 
         for chunk in small_chunks:
-
-
+            token_count = count_tokens(chunk, model_name="gpt-3.5-turbo")
+            if token_count > max_tokens:
+                # If the chunk is too big, break it up
+                splitter_token_safe = RecursiveCharacterTextSplitter(
+                    chunk_size=1000,  # or anything that keeps the token count safely under 2000
+                    chunk_overlap=100
+                )
+                smaller_chunks = splitter_token_safe.split_text(chunk)
+                final_chunks.extend(smaller_chunks)
             else:
                 final_chunks.append(chunk)
 
-        # Prepare the documents for the index
-        from langchain.schema import Document as LangchainDocument
         documents = [LangchainDocument(page_content=text) for text in final_chunks]
 
-        # Build the embeddings
         embeddings = TogetherEmbeddings(
             model_name="togethercomputer/m2-bert-80M-32k-retrieval",
             api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
         )
 
-        # Build the index
         return VectorstoreIndexCreator(
             embedding=embeddings,
             text_splitter=RecursiveCharacterTextSplitter(
-                chunk_size=
-                chunk_overlap=
+                chunk_size=1000,
+                chunk_overlap=100
             )
         ).from_documents(documents)
 
```
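The new `count_tokens` helper is the core of the change: it measures each chunk with tiktoken's encoding for `gpt-3.5-turbo` (cl100k_base). The embedding model, `togethercomputer/m2-bert-80M-32k-retrieval`, uses its own tokenizer, so these counts are only an approximation of what Together actually sees, and the 2000-token guard leaves headroom for that. A minimal standalone sketch of the helper's behavior:

```python
import tiktoken

def count_tokens(text, model_name="gpt-3.5-turbo"):
    # tiktoken resolves the model name to its encoding (cl100k_base here)
    enc = tiktoken.encoding_for_model(model_name)
    return len(enc.encode(text))

sample = "LangChain splits documents into overlapping chunks before embedding."
# For ordinary prose, the token count is typically a small fraction of the
# character count, so a 2000-token budget is far looser than 2000 characters.
print(count_tokens(sample), "tokens for", len(sample), "characters")
```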
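Note that the re-splitting loop bounds tokens only indirectly: it splits oversize chunks by characters (`chunk_size=1000`) and trusts that 1000 characters stay under 2000 tokens. If a hard token bound is the goal, LangChain's splitters can also measure length in tiktoken tokens directly. A sketch of that alternative, with illustrative parameter values rather than ones taken from this commit:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Here chunk_size is a budget in tiktoken tokens, not characters;
# 2000 mirrors the max_tokens guard in the diff.
token_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=2000,
    chunk_overlap=100,
)

final_chunks = []
for chunk in small_chunks:  # small_chunks as produced earlier in get_pdf_index
    final_chunks.extend(token_splitter.split_text(chunk))
```

Every chunk this emits fits the token budget by construction, so the per-chunk `count_tokens` check and the fallback splitter would become unnecessary.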
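Because `get_pdf_index` is wrapped in `@st.cache_resource`, the PDF is loaded, split, and embedded once per server process, and the same index is reused across Streamlit reruns. `VectorstoreIndexCreator(...).from_documents(...)` returns an index wrapper whose `query` method retrieves relevant chunks and passes them to an LLM. A hedged usage sketch; `some_llm` and the question are placeholders, not part of the commit, and the `api_key` hardcoded in the diff would normally come from an environment variable or `st.secrets`:

```python
index = get_pdf_index()  # cached: built once, reused on every rerun

# some_llm is a placeholder for whatever chat model the app configures.
answer = index.query("What is this PDF about?", llm=some_llm)
st.write(answer)
```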