File size: 1,868 Bytes
0827f9d
 
 
 
 
1ce1659
 
38fd181
0827f9d
 
1ce1659
0827f9d
 
1ce1659
 
0827f9d
1ce1659
 
0827f9d
 
1ce1659
 
 
 
0827f9d
 
56cf7e3
1ce1659
 
0827f9d
1ce1659
0827f9d
38fd181
0827f9d
1ce1659
0827f9d
38fd181
a5e8d12
 
0827f9d
a5e8d12
0827f9d
a5e8d12
 
0827f9d
a5e8d12
 
0827f9d
 
a5e8d12
 
 
 
0827f9d
 
a5e8d12
 
0827f9d
a5e8d12
0827f9d
a5e8d12
0827f9d
a5e8d12
0827f9d
a5e8d12
0827f9d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
Author: Khanh Phan
Date: 2024-12-04
"""

from nltk.tokenize import sent_tokenize


# TODO: consider moving to helpers
def split_into_sentences(input_text: str) -> list[str]:
    """
    Breaks text into lines, then tokenizes every non-blank line
        into sentences with NLTK's ``sent_tokenize``.

    Args:
        input_text (str): The text to split.

    Returns:
        list: The sentences of all non-blank lines, in document order.
            An empty list is returned when the input is not a string
            or contains only whitespace.
    """
    if not isinstance(input_text, str):
        return []

    collected: list[str] = []
    # Line terminators are whitespace, so strip() below discards them;
    # there is no need to ask splitlines() to keep them.
    for raw_line in input_text.splitlines():
        line = raw_line.strip()
        if not line:
            # Blank or whitespace-only line: nothing to tokenize.
            continue
        collected += sent_tokenize(line)

    return collected


def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Breaks text into paragraphs, one per non-blank line.

    Args:
        input_text (str): The text to split.

    Returns:
        list: Each non-blank line with its surrounding whitespace removed,
            in document order. An empty list is returned when the input
            is not a string or contains only whitespace.
    """
    if not isinstance(input_text, str):
        return []

    # Line terminators are whitespace, so strip() discards them anyway.
    trimmed_lines = (line.strip() for line in input_text.splitlines())

    # Drop lines that were empty or whitespace-only.
    return [line for line in trimmed_lines if line]