M17idd committed
Commit 9912747 · verified · Parent(s): 59076d5

Update app.py

Files changed (1)
  1. app.py +19 -15
app.py CHANGED
@@ -1,4 +1,5 @@
 import time
+import tiktoken
 import streamlit as st
 from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -104,16 +105,19 @@ class TogetherEmbeddings(Embeddings):
     def embed_query(self, text: str) -> List[float]:
         return self.embed_documents([text])[0]
 
+
+def count_tokens(text, model_name="gpt-3.5-turbo"):
+    enc = tiktoken.encoding_for_model(model_name)
+    return len(enc.encode(text))
+
 @st.cache_resource
 def get_pdf_index():
     with st.spinner('📄 Processing the PDF file...'):
-        # Load the file
         loader = [PyPDFLoader('test1.pdf')]
         pages = []
         for l in loader:
             pages.extend(l.load())
 
-        # First, plain chunking with size 124
         splitter_initial = RecursiveCharacterTextSplitter(
             chunk_size=124,
             chunk_overlap=25
@@ -127,35 +131,35 @@ def get_pdf_index():
         else:
             small_chunks.append(text)
 
-        # Now check that no chunk is longer than 3000 characters
+        # Now a real check, based on token count
         final_chunks = []
-        final_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=2000,
-            chunk_overlap=200
-        )
+        max_tokens = 2000  # maximum tokens allowed for Together
 
         for chunk in small_chunks:
-            if len(chunk) > 2000:
-                final_chunks.extend(final_splitter.split_text(chunk))
+            token_count = count_tokens(chunk, model_name="gpt-3.5-turbo")
+            if token_count > max_tokens:
+                # if the chunk is too big, chop it up
+                splitter_token_safe = RecursiveCharacterTextSplitter(
+                    chunk_size=1000,  # or anything that keeps the token count safely under 2000
+                    chunk_overlap=100
+                )
+                smaller_chunks = splitter_token_safe.split_text(chunk)
+                final_chunks.extend(smaller_chunks)
             else:
                 final_chunks.append(chunk)
 
-        # Prepare the documents for the index
-        from langchain.schema import Document as LangchainDocument
         documents = [LangchainDocument(page_content=text) for text in final_chunks]
 
-        # Build the embeddings
         embeddings = TogetherEmbeddings(
             model_name="togethercomputer/m2-bert-80M-32k-retrieval",
             api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
         )
 
-        # Build the index
         return VectorstoreIndexCreator(
             embedding=embeddings,
             text_splitter=RecursiveCharacterTextSplitter(
-                chunk_size=2000,
-                chunk_overlap=200
+                chunk_size=1000,
+                chunk_overlap=100
             )
         ).from_documents(documents)
 
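For anyone trying the change outside the app, here is a minimal standalone sketch of the token guard this commit introduces, assuming tiktoken and langchain are installed. count_tokens is copied from the diff; enforce_token_limit is a hypothetical wrapper name for the loop the commit adds, and the demo strings are illustrative. One caveat visible in the diff itself: tokens are counted with the gpt-3.5-turbo encoding, which only approximates the tokenizer of togethercomputer/m2-bert-80M-32k-retrieval, so the 2000-token cap is a heuristic bound.

import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

def count_tokens(text, model_name="gpt-3.5-turbo"):
    # Same helper as in the commit: length of the text under the chosen encoding.
    enc = tiktoken.encoding_for_model(model_name)
    return len(enc.encode(text))

def enforce_token_limit(chunks, max_tokens=2000):
    # Hypothetical wrapper around the loop added in this commit: re-split any
    # chunk whose token count exceeds the cap, pass the rest through unchanged.
    splitter_token_safe = RecursiveCharacterTextSplitter(
        chunk_size=1000,   # measured in characters, chosen to land well under the token cap
        chunk_overlap=100
    )
    final_chunks = []
    for chunk in chunks:
        if count_tokens(chunk) > max_tokens:
            final_chunks.extend(splitter_token_safe.split_text(chunk))
        else:
            final_chunks.append(chunk)
    return final_chunks

if __name__ == "__main__":
    demo = ["a short chunk", "word " * 10000]  # the second string is far over 2000 tokens
    for piece in enforce_token_limit(demo):
        print(len(piece), count_tokens(piece))  # token counts should now sit under the cap

Because RecursiveCharacterTextSplitter measures characters while the guard measures tokens, chunk_size=1000 keeps chunks under the cap only heuristically, as the in-diff comment concedes; passing length_function=count_tokens to the splitter would enforce the limit in tokens directly.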