"""
Author: Khanh Phan
Date: 2024-12-04
"""

from nltk.tokenize import sent_tokenize


# TODO: consider moving to helpers
def split_into_sentences(input_text: str) -> list[str]:
"""
Splits input text into sentences by newlines
and then tokenizes each paragraph into sentences.
Args:
input_text (str): The input text as a string.
Returns:
list: A list of sentences.
Returns an empty list if input is not a string.
"""
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    sentences = []
    for paragraph in paragraphs:
        # Remove leading/trailing whitespace.
        paragraph = paragraph.strip()
        if paragraph:
            # Tokenize the paragraph into sentences.
            sentences.extend(sent_tokenize(paragraph))
    return sentences
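

# Illustrative usage (a sketch, not an executable part of the module): the
# sample text below is hypothetical, and the shown output assumes NLTK's
# default English "punkt" tokenizer data has been downloaded, e.g. via
# nltk.download("punkt").
#
#   split_into_sentences("Hello world. How are you?\n\nA new paragraph.")
#   # -> ["Hello world.", "How are you?", "A new paragraph."]
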
def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Splits the input text into paragraphs based on newline characters.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list[str]: A list of paragraphs.
            Returns an empty list if the input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    out_paragraphs = []
    for paragraph in paragraphs:
        # Remove leading/trailing whitespace.
        paragraph = paragraph.strip()
        if paragraph:
            # Append the cleaned paragraph to the output list.
            out_paragraphs.append(paragraph)
    return out_paragraphs
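

# Minimal demo sketch (the sample string is hypothetical, not from the
# original module): each non-empty line becomes one paragraph and blank
# lines are dropped.
if __name__ == "__main__":
    sample = "First paragraph.\n\nSecond paragraph.\n"
    print(split_into_paragraphs(sample))
    # Expected: ['First paragraph.', 'Second paragraph.']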