MilanM committed on
Commit
9ad01a7
·
verified ·
1 Parent(s): 8fa1de9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -0
app.py CHANGED
@@ -707,6 +707,10 @@ def sentence_splitter_instantiation(
707
  sentence_splitter_config,
708
  ):
709
  ### Chunker/Sentence Splitter
 
 
 
 
710
  if sentence_splitter_config.value is not None:
711
  sentence_splitter_config_values = sentence_splitter_config.value
712
  validated_chunk_overlap = min(sentence_splitter_config_values.get("chunk_overlap"),
@@ -719,6 +723,7 @@ def sentence_splitter_instantiation(
719
  paragraph_separator=sentence_splitter_config_values.get("paragraph_separator"),
720
  secondary_chunking_regex=sentence_splitter_config_values.get("secondary_chunking_regex"),
721
  include_metadata=sentence_splitter_config_values.get("include_metadata"),
 
722
  )
723
 
724
  else:
@@ -729,6 +734,7 @@ def sentence_splitter_instantiation(
729
  paragraph_separator="\n\n\n",
730
  secondary_chunking_regex="[^,.;?!]+[,.;?!]?",
731
  include_metadata=True,
 
732
  )
733
  return (sentence_splitter,)
734
 
 
707
  sentence_splitter_config,
708
  ):
709
  ### Chunker/Sentence Splitter
710
+ def simple_whitespace_tokenizer(text):
711
+ """Tokenizer that considers each word as one token"""
712
+ return text.split()
713
+
714
  if sentence_splitter_config.value is not None:
715
  sentence_splitter_config_values = sentence_splitter_config.value
716
  validated_chunk_overlap = min(sentence_splitter_config_values.get("chunk_overlap"),
 
723
  paragraph_separator=sentence_splitter_config_values.get("paragraph_separator"),
724
  secondary_chunking_regex=sentence_splitter_config_values.get("secondary_chunking_regex"),
725
  include_metadata=sentence_splitter_config_values.get("include_metadata"),
726
+ tokenizer=simple_whitespace_tokenizer
727
  )
728
 
729
  else:
 
734
  paragraph_separator="\n\n\n",
735
  secondary_chunking_regex="[^,.;?!]+[,.;?!]?",
736
  include_metadata=True,
737
+ tokenizer=simple_whitespace_tokenizer
738
  )
739
  return (sentence_splitter,)
740