Spaces:
Sleeping
Sleeping
""" | |
Author: Khanh Phan | |
Date: 2024-12-04 | |
""" | |
from nltk.tokenize import sent_tokenize | |
# TODO: consider moving to helpers | |
def split_into_sentences(input_text: str) -> list[str]:
    """
    Split text into sentences, processing one line at a time.

    The input is first cut at line boundaries. Each line is stripped of
    surrounding whitespace, blank lines are skipped, and every remaining
    line is passed to NLTK's ``sent_tokenize``. The per-line results are
    concatenated in their original order, so a sentence never spans a
    line break.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list[str]: The sentences in order of appearance. An empty list is
            returned if the input is not a string or contains no
            non-blank lines.
    """
    if not isinstance(input_text, str):
        return []

    sentences: list[str] = []
    # Line terminators are whitespace, so strip() would discard them anyway;
    # there is no need to ask splitlines() to keep them.
    for line in input_text.splitlines():
        paragraph = line.strip()
        if not paragraph:
            # Blank or whitespace-only line: nothing to tokenize.
            continue
        sentences.extend(sent_tokenize(paragraph))
    return sentences
def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Split text into paragraphs, treating each non-blank line as one.

    The input is cut at line boundaries, each line is stripped of
    leading and trailing whitespace, and lines that are empty after
    stripping are dropped.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list[str]: The stripped, non-empty lines in their original order.
            An empty list is returned if the input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Line terminators are whitespace, so strip() removes them regardless;
    # a stripped line can therefore never equal "\n" and only the
    # emptiness check is needed.
    stripped_lines = (line.strip() for line in input_text.splitlines())
    return [paragraph for paragraph in stripped_lines if paragraph]