Update app.py
Browse files
app.py
CHANGED
@@ -707,6 +707,10 @@ def sentence_splitter_instantiation(
|
|
707 |
sentence_splitter_config,
|
708 |
):
|
709 |
### Chunker/Sentence Splitter
|
|
|
|
|
|
|
|
|
710 |
if sentence_splitter_config.value is not None:
|
711 |
sentence_splitter_config_values = sentence_splitter_config.value
|
712 |
validated_chunk_overlap = min(sentence_splitter_config_values.get("chunk_overlap"),
|
@@ -719,6 +723,7 @@ def sentence_splitter_instantiation(
|
|
719 |
paragraph_separator=sentence_splitter_config_values.get("paragraph_separator"),
|
720 |
secondary_chunking_regex=sentence_splitter_config_values.get("secondary_chunking_regex"),
|
721 |
include_metadata=sentence_splitter_config_values.get("include_metadata"),
|
|
|
722 |
)
|
723 |
|
724 |
else:
|
@@ -729,6 +734,7 @@ def sentence_splitter_instantiation(
|
|
729 |
paragraph_separator="\n\n\n",
|
730 |
secondary_chunking_regex="[^,.;?!]+[,.;?!]?",
|
731 |
include_metadata=True,
|
|
|
732 |
)
|
733 |
return (sentence_splitter,)
|
734 |
|
|
|
707 |
sentence_splitter_config,
|
708 |
):
|
709 |
### Chunker/Sentence Splitter
|
710 |
+
def simple_whitespace_tokenizer(text):
|
711 |
+
"""Tokenizer that considers each word as one token"""
|
712 |
+
return text.split()
|
713 |
+
|
714 |
if sentence_splitter_config.value is not None:
|
715 |
sentence_splitter_config_values = sentence_splitter_config.value
|
716 |
validated_chunk_overlap = min(sentence_splitter_config_values.get("chunk_overlap"),
|
|
|
723 |
paragraph_separator=sentence_splitter_config_values.get("paragraph_separator"),
|
724 |
secondary_chunking_regex=sentence_splitter_config_values.get("secondary_chunking_regex"),
|
725 |
include_metadata=sentence_splitter_config_values.get("include_metadata"),
|
726 |
+
tokenizer=simple_whitespace_tokenizer
|
727 |
)
|
728 |
|
729 |
else:
|
|
|
734 |
paragraph_separator="\n\n\n",
|
735 |
secondary_chunking_regex="[^,.;?!]+[,.;?!]?",
|
736 |
include_metadata=True,
|
737 |
+
tokenizer=simple_whitespace_tokenizer
|
738 |
)
|
739 |
return (sentence_splitter,)
|
740 |
|