app token_size updated
app.py CHANGED

```diff
@@ -167,6 +167,10 @@ def main():
 
     # File uploader widget
     if uploaded_file is not None:
+        # Select token size:
+        st.sidebar.success("Note: 1 Token ~ 4 Characters.")
+        token_size = st.sidebar.slider('Select a token size (when we scrape the document)', 5, 150, 20)
+
         # To read file as bytes:
         bytes_data = uploaded_file.getvalue()
         st.success("Your PDF is uploaded successfully.")
@@ -199,7 +203,7 @@ def main():
         # Tokenize it
         st.warning("Start tokenzing ...")
         token_splitter = SentenceTransformersTokenTextSplitter(
-            chunk_overlap=
+            chunk_overlap=5, tokens_per_chunk=token_size
        )
         token_split_texts = []
         for text in character_split_texts:
```
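For reference, a minimal standalone sketch of the flow after this commit: the sidebar slider picks a tokens-per-chunk value, which is passed to the token splitter. The LangChain import path and the placeholder `character_split_texts` list are assumptions here; app.py's actual imports and the upstream character-level splitting step are not shown in this diff.

```python
# Minimal sketch of the changed flow. The import path is an assumption
# (app.py's import block is not part of this diff).
import streamlit as st
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# Sidebar slider: minimum 5, maximum 150, default 20 tokens per chunk.
st.sidebar.success("Note: 1 Token ~ 4 Characters.")
token_size = st.sidebar.slider(
    'Select a token size (when we scrape the document)', 5, 150, 20
)

# Re-split character-level chunks into chunks of roughly token_size tokens,
# with a 5-token overlap between consecutive chunks.
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=5, tokens_per_chunk=token_size
)

# Placeholder input; in app.py this list comes from an earlier
# character-level splitting step applied to the uploaded PDF.
character_split_texts = ["Example text extracted from the uploaded PDF."]

token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)
```

One caveat on the design: `tokens_per_chunk` is capped by the maximum sequence length of the underlying sentence-transformer model, and the slider's 150-token ceiling stays safely below typical limits.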