"""
Author: Khanh Phan
Date: 2024-12-04
"""

from nltk.tokenize import sent_tokenize


# TODO: consider moving to helpers
def split_into_sentences(input_text: str) -> list[str]:
    """
    Splits input text into sentences, paragraph by paragraph.

    The text is first broken into paragraphs with `split_into_paragraphs`
    (one paragraph per non-blank line), and each paragraph is then passed
    to `sent_tokenize` on its own, so a sentence never spans a line break.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of sentences, in their original order.
            Returns an empty list if input is not a string or contains
            only whitespace.
    """
    sentences: list[str] = []
    # split_into_paragraphs already handles non-string input (returns [])
    # and drops blank lines, so no extra guard is needed here.
    for paragraph in split_into_paragraphs(input_text):
        sentences.extend(sent_tokenize(paragraph))
    return sentences


def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Splits input text into paragraphs based on line boundaries.

    Each line of the input is treated as one paragraph. Leading and
    trailing whitespace is removed from every paragraph, and lines that
    are empty after stripping are discarded.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of non-empty, whitespace-stripped paragraphs.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # The line terminators are not kept: strip() would remove them anyway,
    # since every boundary str.splitlines() recognises is whitespace.
    stripped_lines = (line.strip() for line in input_text.splitlines())
    return [line for line in stripped_lines if line]