Spaces:
Sleeping
Sleeping
File size: 1,868 Bytes
0827f9d 1ce1659 38fd181 0827f9d 1ce1659 0827f9d 1ce1659 0827f9d 1ce1659 0827f9d 1ce1659 0827f9d 56cf7e3 1ce1659 0827f9d 1ce1659 0827f9d 38fd181 0827f9d 1ce1659 0827f9d 38fd181 a5e8d12 0827f9d a5e8d12 0827f9d a5e8d12 0827f9d a5e8d12 0827f9d a5e8d12 0827f9d a5e8d12 0827f9d a5e8d12 0827f9d a5e8d12 0827f9d a5e8d12 0827f9d a5e8d12 0827f9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
"""
Author: Khanh Phan
Date: 2024-12-04
"""
from nltk.tokenize import sent_tokenize
# TODO: consider moving to helpers
def split_into_sentences(input_text: str) -> list[str]:
    """
    Split input text into sentences, one paragraph (line) at a time.

    The text is first broken into lines with ``str.splitlines``; each line
    is treated as a paragraph. Every paragraph is stripped of surrounding
    whitespace, blank paragraphs are skipped, and the remaining ones are
    passed to ``sent_tokenize``. The resulting sentences are concatenated
    in their original order.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of sentences.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    sentences = []
    for line in input_text.splitlines():
        # strip() removes surrounding whitespace, including any line
        # terminator, so a blank line always collapses to "".
        paragraph = line.strip()
        if not paragraph:
            continue
        # Tokenize paragraph by paragraph so that a sentence never spans
        # a line break.
        sentences.extend(sent_tokenize(paragraph))
    return sentences
def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Split input text into non-empty, whitespace-trimmed paragraphs.

    Each line of the input (as delimited by ``str.splitlines``) is treated
    as one paragraph. Leading and trailing whitespace is removed from every
    paragraph, and lines that are empty or whitespace-only are dropped.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of paragraphs, in their original order.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # strip() already removes any line terminator, so there is no need to
    # keep line endings while splitting or to compare against "\n" later.
    stripped_lines = (line.strip() for line in input_text.splitlines())
    return [paragraph for paragraph in stripped_lines if paragraph]
|