# File size: 5,089 Bytes
# db0a2ce
import re
from dateutil.parser import parse
from num2words import num2words
import inflect
from ftfy import fix_text

# Initialize the inflect engine
# NOTE(review): inflect_engine is not referenced anywhere in the visible part
# of this file -- confirm whether another module relies on it before removing.
inflect_engine = inflect.engine()

# Define alphabet pronunciation mapping
# Maps each uppercase ASCII letter to a spelled-out pronunciation. It is used
# by replace_abbreviations to read short all-caps words letter by letter.
# Every value is padded with a space on both sides so that consecutive letters
# remain separate words once the pieces are joined; the surplus spaces are
# collapsed later by clean_whitespace. "Z" is rendered as "Zed".
alphabet_map = {
    "A": " Eh ", "B": " Bee ", "C": " See ", "D": " Dee ", "E": " Eee ",
    "F": " Eff ", "G": " Jee ", "H": " Aitch ", "I": " Eye ", "J": " Jay ",
    "K": " Kay ", "L": " El ", "M": " Emm ", "N": " Enn ", "O": " Ohh ",
    "P": " Pee ", "Q": " Queue ", "R": " Are ", "S": " Ess ", "T": " Tee ",
    "U": " You ", "V": " Vee ", "W": " Double You ", "X": " Ex ", "Y": " Why ", "Z": " Zed "
}

# Function to add ordinal suffix to a number
def add_ordinal_suffix(day):
    """Return ``day`` followed by its English ordinal suffix.

    Examples: 1 -> "1st", 22 -> "22nd", 13 -> "13th", 112 -> "112th".

    Args:
        day: An integer, typically a day of the month.

    Returns:
        The number rendered as a string with "st", "nd", "rd" or "th"
        appended.
    """
    # The "teens" are irregular in every hundred (11th, 111th, 212th, ...),
    # so the check has to look at the last two digits, not the raw value.
    if 11 <= day % 100 <= 13:
        return f"{day}th"
    suffix = {1: "st", 2: "nd", 3: "rd"}.get(day % 10, "th")
    return f"{day}{suffix}"

# Function to format dates in a human-readable form
def format_date(parsed_date, include_time=True):
    """Format a parsed date as a human-readable, TTS-friendly string.

    Args:
        parsed_date: A ``datetime``-like object exposing ``day``, ``hour``,
            ``minute`` and ``strftime``; a falsy value yields ``None``.
        include_time: When true, the time of day is appended unless it is
            exactly 00:00, which is treated as "no time was given".

    Returns:
        A string such as "January 13th, 2024" or
        "January 13th, 2024 at 10:00 AM", or ``None`` when ``parsed_date``
        is falsy.
    """
    if not parsed_date:
        return None

    # Convert the day into an ordinal (e.g., 13 -> 13th)
    day = add_ordinal_suffix(parsed_date.day)
    date_part = f"{parsed_date.strftime('%B')} {day}, {parsed_date.strftime('%Y')}"

    # A time is present when EITHER field is non-zero; requiring both to be
    # non-zero would silently drop times such as 10:00 or 00:30.
    has_time = parsed_date.hour != 0 or parsed_date.minute != 0
    if not (include_time and has_time):
        return date_part

    # "%-I" (hour without zero padding) is a platform-specific strftime
    # extension, so the 12-hour value is computed by hand to stay portable.
    hour_12 = parsed_date.hour % 12 or 12
    return f"{date_part} at {hour_12}:{parsed_date.strftime('%M %p')}"

# Normalize dates in the text
def normalize_dates(text):
    """Replace date strings in ``text`` with a TTS-friendly spoken form.

    Recognised shapes are ``YYYY-MM-DD`` (optionally followed by a space or
    ``T`` and ``HH:MM:SS``), ``DD/DD/YYYY`` and ``D[D] <word> YYYY``. Each
    candidate is handed to dateutil's ``parse``; candidates that cannot be
    parsed are left exactly as they were.

    NOTE(review): slash dates are parsed with dateutil's defaults, which read
    an ambiguous ``01/02/2024`` month-first -- confirm this matches the input.

    Args:
        text: The text to scan.

    Returns:
        ``text`` with every parseable date replaced by the output of
        ``format_date``.
    """
    def replace_date(match):
        raw_date = match.group(0)
        try:
            parsed_date = parse(raw_date)
        except (ValueError, OverflowError):
            # The pattern is deliberately loose ("12 apples 2024" matches);
            # anything dateutil rejects -- including values too large for a
            # datetime -- is not a date and stays untouched.
            return raw_date
        # Of the shapes matched below only the ISO date-time form carries a
        # time of day, and it is the only one that can contain a colon.
        include_time = ":" in raw_date
        # re.sub requires a string; fall back to the raw text if the
        # formatter declines to produce one.
        return format_date(parsed_date, include_time) or raw_date

    # Match common date formats
    date_pattern = r"\b(\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?|\d{2}/\d{2}/\d{4}|\d{1,2} \w+ \d{4})\b"
    return re.sub(date_pattern, replace_date, text)

# Replace invalid characters and clean text
def replace_invalid_chars(string):
    """Pass the text through ftfy's ``fix_text`` and apply literal substitutions.

    The substitutions are applied one after another in the order listed, so
    an earlier replacement can affect what a later one sees.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    cleaned = fix_text(string)
    substitutions = (
        ("**", ""),
        ('&#x27;', "'"),
        ('AI;', 'Artificial Intelligence!'),
        ('iddqd;', 'Immortality cheat code'),
        ('😉;', 'wink wink!'),
        (':D', '*laughs* Ahahaha!'),
        (';D', '*laughs* Ahahaha!'),
    )
    for target, replacement in substitutions:
        cleaned = cleaned.replace(target, replacement)
    return cleaned

# Replace numbers with their word equivalents
# One scanner for every digit-bearing token. The "keep" alternatives are tried
# before "number" at each position, so a protected token always wins over the
# bare digits it contains.
_NUMBER_TOKEN_RE = re.compile(
    r"(?P<keep>"
    # Dates already in spoken form, e.g. "January 13th, 2024 at 10:30 AM".
    r"\b[A-Z][a-z]+ \d{1,2}(?:st|nd|rd|th), \d{4}(?: at \d{1,2}:\d{2} [AP]M)?\b"
    # ISO dates, optionally with a "T"-separated time.
    r"|\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2})?\b"
    # IPv4 addresses.
    r"|\b\d{1,3}(?:\.\d{1,3}){3}\b"
    # IPv6 addresses (this shape also covers hh:mm:ss times).
    r"|(?:[0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}"
    # Hyphenated digit groups: ranges like 1-4, dates, phone-style numbers.
    r"|\b\d+(?:-\d+)+\b"
    # Alphanumerics such as "GPT4" or "3rd".
    r"|\b[A-Za-z]+\d+|\d+[A-Za-z]+\b"
    r")|(?P<number>\b\d+\b)"
)


def replace_numbers(string):
    """Spell out standalone numbers while leaving structured tokens intact.

    IP addresses, dates, hyphenated digit groups and alphanumeric tokens are
    kept exactly as written. Every other run of digits delimited by word
    boundaries (including port numbers) is replaced with its ``num2words``
    rendering. Protection is per token, so a single IP address in a sentence
    no longer prevents the other numbers in that sentence from being
    converted.

    Args:
        string: The text to process.

    Returns:
        The text with standalone numbers replaced by words.
    """
    def convert_token(match):
        digits = match.group("number")
        if digits is None:
            # A protected token matched; emit it unchanged.
            return match.group(0)
        try:
            return num2words(int(digits))
        except (ValueError, OverflowError):
            # Too large to convert; keep the digits rather than abort the
            # whole string.
            return digits

    return _NUMBER_TOKEN_RE.sub(convert_token, string)

# Replace abbreviations with expanded form
def replace_abbreviations(string):
    """Spell out short all-caps abbreviations letter by letter.

    The text is split on whitespace. For each word, surrounding punctuation
    is set aside and the remaining core is spelled out via ``alphabet_map``
    when it is upper-case, at most four characters long and contains no
    digit. The punctuation is then re-attached. "ID", "AM" and "PM" are left
    alone, as are the ordinary English words "I" and "A".

    Words are re-joined with single spaces, so runs of whitespace collapse.

    Args:
        string: The text to process.

    Returns:
        The text with qualifying abbreviations expanded.
    """
    keep_as_is = {"ID", "AM", "PM", "I", "A"}
    edge_punctuation = ".,;:!?\"'()[]{}"

    def spell_out(word):
        # Judge the word without its surrounding punctuation so that "PM."
        # is still recognised as "PM" and "NASA," is treated like "NASA".
        core = word.strip(edge_punctuation)
        if (
            not core.isupper()
            or len(core) > 4
            or any(char.isdigit() for char in core)
            or core in keep_as_is
        ):
            return word
        lead = len(word) - len(word.lstrip(edge_punctuation))
        spelled = ''.join(alphabet_map.get(char, char) for char in core)
        return word[:lead] + spelled + word[lead + len(core):]

    return ' '.join(spell_out(word) for word in string.split())

# Clean up whitespace in the text
def clean_whitespace(string):
    """Tidy spacing: no whitespace before . , ? ! and single spaces elsewhere.

    Leading and trailing whitespace is removed as a side effect of the
    split/join step.
    """
    tightened = re.sub(r'\s+([.,?!])', lambda m: m.group(1), string)
    tokens = tightened.split()
    return ' '.join(tokens)

# Main preprocessing pipeline
def preprocess_all(string):
    """Run every preprocessing stage over ``string`` and return the result.

    Stages run in a fixed order: date normalization, character cleanup,
    number replacement, abbreviation expansion, then whitespace cleanup.
    Each stage receives the previous stage's output.
    """
    stages = (
        normalize_dates,
        replace_invalid_chars,
        replace_numbers,
        replace_abbreviations,
        clean_whitespace,
    )
    for stage in stages:
        string = stage(string)
    return string

# Expose a testing function for external use
def test_preprocessing(file_path, encoding="utf-8"):
    """Print each line of a text file next to its preprocessed form.

    Args:
        file_path: Path of the text file to read, one sample per line.
        encoding: Text encoding of the file. Defaults to UTF-8 so that emoji
            and other non-ASCII samples decode the same way on every
            platform instead of depending on the locale's default encoding.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Stream the file rather than loading every line up front.
        for line in file:
            original = line.strip()
            processed = preprocess_all(original)
            print(f"Original: {original}")
            print(f"Processed: {processed}\n")

if __name__ == "__main__":
    import sys

    # Command-line entry point: the first argument is the path of a text file
    # whose lines are run through the preprocessing pipeline.
    if len(sys.argv) > 1:
        test_preprocessing(sys.argv[1])
    else:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # so calling scripts can detect the missing argument.
        sys.exit("Please provide a file path as an argument.")