eagle0504 committed on
Commit
efce82a
·
1 Parent(s): 6569632

app token_size updated

Browse files
Files changed (1) hide show
  1. app.py +5 -1
app.py CHANGED
@@ -167,6 +167,10 @@ def main():
167
 
168
  # File uploader widget
169
  if uploaded_file is not None:
 
 
 
 
170
  # To read file as bytes:
171
  bytes_data = uploaded_file.getvalue()
172
  st.success("Your PDF is uploaded successfully.")
@@ -199,7 +203,7 @@ def main():
199
  # Tokenize it
200
  st.warning("Start tokenzing ...")
201
  token_splitter = SentenceTransformersTokenTextSplitter(
202
- chunk_overlap=0, tokens_per_chunk=20
203
  )
204
  token_split_texts = []
205
  for text in character_split_texts:
 
167
 
168
  # File uploader widget
169
  if uploaded_file is not None:
170
+ # Select token size:
171
+ st.sidebar.success("Note: 1 Token ~ 4 Characters.")
172
+ token_size = st.sidebar.slider('Select a token size (when we scrape the document)', 5, 150, 20)
173
+
174
  # To read file as bytes:
175
  bytes_data = uploaded_file.getvalue()
176
  st.success("Your PDF is uploaded successfully.")
 
203
  # Tokenize it
204
  st.warning("Start tokenzing ...")
205
  token_splitter = SentenceTransformersTokenTextSplitter(
206
+ chunk_overlap=5, tokens_per_chunk=token_size
207
  )
208
  token_split_texts = []
209
  for text in character_split_texts: