"""Text preprocessing helpers that rewrite raw text into a TTS-friendly form.

The pipeline (see ``preprocess_all``) rewrites dates, applies a fixed table of
substring replacements, spells out standalone numbers, spells short upper-case
abbreviations letter by letter, and finally tidies whitespace.
"""

import re

import inflect
from dateutil.parser import parse
from ftfy import fix_text
from num2words import num2words

# Initialize the inflect engine
inflect_engine = inflect.engine()

# Define alphabet pronunciation mapping
alphabet_map = {
    "A": " Eh ",
    "B": " Bee ",
    "C": " See ",
    "D": " Dee ",
    "E": " Eee ",
    "F": " Eff ",
    "G": " Jee ",
    "H": " Aitch ",
    "I": " Eye ",
    "J": " Jay ",
    "K": " Kay ",
    "L": " El ",
    "M": " Emm ",
    "N": " Enn ",
    "O": " Ohh ",
    "P": " Pee ",
    "Q": " Queue ",
    "R": " Are ",
    "S": " Ess ",
    "T": " Tee ",
    "U": " You ",
    "V": " Vee ",
    "W": " Double You ",
    "X": " Ex ",
    "Y": " Why ",
    "Z": " Zed ",
}

# Date shapes recognised by normalize_dates: ISO date with an optional
# HH:MM:SS part, NN/NN/NNNN, and "D Month YYYY". The month word is limited to
# letters so that plain digit runs such as "12 345 6789" are not treated as
# dates.
_DATE_RE = re.compile(
    r"\b(?:"
    r"\d{4}-\d{2}-\d{2}(?P<time>[ T]\d{2}:\d{2}:\d{2})?"
    r"|\d{2}/\d{2}/\d{4}"
    r"|\d{1,2} [A-Za-z]+ \d{4}"
    r")\b"
)

# The apostrophe entity is spelled in two pieces on purpose: tools that
# HTML-unescape this source would otherwise collapse the literal into three
# quote characters, which is an unterminated string.
# NOTE(review): the key was already garbled in the version under review; the
# entity is inferred from its replacement value "'" -- confirm.
_APOSTROPHE_ENTITY = "&#" + "x27;"

# Literal substring replacements, applied in this order.
_CHAR_REPLACEMENTS = {
    "**": "",
    _APOSTROPHE_ENTITY: "'",
    'AI;': 'Artificial Intelligence!',
    'iddqd;': 'Immortality cheat code',
    '😉;': 'wink wink!',
    ':D': '*laughs* Ahahaha!',
    ';D': '*laughs* Ahahaha!',
}

# One scan over the text: "protected" tokens are left untouched, "number"
# tokens are spelled out. Protected alternatives are tried first so that, for
# example, the octets of an IP address are never seen as standalone numbers.
_NUMBER_TOKEN_RE = re.compile(
    r"(?P<protected>"
    r"\b\d{1,3}(?:\.\d{1,3}){3}\b"                    # IPv4 address
    r"|(?:[0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}"    # IPv6-like groups
    r"|\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2})?\b"  # ISO date / datetime
    r"|\b\d+-\d+\b"                                   # ranges like 1-4
    r"|\b[A-Za-z]+\d+|\d+[A-Za-z]+\b"                 # alphanumerics (A1, 13th)
    r")"
    r"|(?P<number>\b\d+\b)"
)

# Upper-case tokens that are kept as words rather than spelled out.
_UNSPELLED_ABBREVIATIONS = frozenset({"ID", "AM", "PM"})

# Splits a whitespace-free token into leading punctuation, core, and trailing
# punctuation, so "PM." is judged as "PM".
_TOKEN_PARTS_RE = re.compile(r"(\W*)(.*?)(\W*)")


# Function to add ordinal suffix to a number
def add_ordinal_suffix(day: int) -> str:
    """Return ``day`` followed by its English ordinal suffix (13 -> "13th").

    Any number whose last two digits are 10-20 takes "th", so 11th-13th and
    also 111th-113th come out right; otherwise the last digit decides.
    """
    if 10 <= day % 100 <= 20:
        return f"{day}th"
    suffix = {1: "st", 2: "nd", 3: "rd"}.get(day % 10, "th")
    return f"{day}{suffix}"


# Function to format dates in a human-readable form
def format_date(parsed_date, include_time=True):
    """Format a parsed date as e.g. "March 13th, 2024 at 2:30 PM".

    Args:
        parsed_date: Object providing ``day``, ``strftime`` and, when
            ``include_time`` is true, ``hour`` and ``minute``.
        include_time: Whether a time of day may be appended.

    Returns:
        The formatted string, or ``None`` when ``parsed_date`` is falsy. The
        time is appended only when ``include_time`` is true and the time is
        not exactly midnight (hour and minute both zero).
    """
    if not parsed_date:
        return None

    # Convert the day into an ordinal (e.g., 13 -> 13th)
    day = add_ordinal_suffix(parsed_date.day)
    date_part = f"{parsed_date.strftime('%B')} {day}, {parsed_date.strftime('%Y')}"

    # Either component being non-zero means a real time was given; requiring
    # both would drop times such as 14:00 or 00:30.
    if include_time and (parsed_date.hour != 0 or parsed_date.minute != 0):
        # Compute the unpadded 12-hour value by hand: the "%-I" directive is a
        # platform-specific strftime extension.
        hour_12 = parsed_date.hour % 12 or 12
        return f"{date_part} at {hour_12}:{parsed_date.strftime('%M %p')}"

    return date_part  # Only date


# Normalize dates in the text
def normalize_dates(text: str) -> str:
    """Replace recognised date strings with a TTS-friendly version.

    Recognised shapes are ``YYYY-MM-DD`` (optionally followed by a space or
    "T" and ``HH:MM:SS``), ``NN/NN/NNNN`` and ``D Month YYYY``. Matches that
    cannot be parsed are left unchanged. Ambiguous numeric dates are
    interpreted by ``dateutil.parser.parse`` with its default settings.
    """

    def replace_date(match):
        raw_date = match.group(0)
        try:
            parsed_date = parse(raw_date)
        except (ValueError, OverflowError):
            # Not a real date after all (e.g. "1 in 1000"); keep the text.
            return raw_date
        # Include time only if explicitly provided
        include_time = match.group("time") is not None
        return format_date(parsed_date, include_time) or raw_date

    return _DATE_RE.sub(replace_date, text)


# Replace invalid characters and clean text
def replace_invalid_chars(string: str) -> str:
    """Run ftfy's ``fix_text`` and then apply the fixed replacement table."""
    string = fix_text(string)
    for old, new in _CHAR_REPLACEMENTS.items():
        string = string.replace(old, new)
    return string


# Replace numbers with their word equivalents
def replace_numbers(string: str) -> str:
    """Spell out standalone integers while leaving protected tokens alone.

    IP addresses, ISO dates, ranges such as ``1-4`` and alphanumeric tokens
    such as ``A1`` or ``13th`` are copied through unchanged. Every other
    digit run delimited by word boundaries (including port numbers) is
    replaced by its ``num2words`` spelling. Protection is per token, so one
    protected token does not stop other numbers in the string from being
    converted.
    """

    def convert_token(match):
        if match.group("protected") is not None:
            return match.group(0)
        digits = match.group("number")
        try:
            return num2words(int(digits))
        except (OverflowError, ValueError):
            # Too large to convert; keep the digits rather than fail.
            return digits

    return _NUMBER_TOKEN_RE.sub(convert_token, string)


# Replace abbreviations with expanded form
def replace_abbreviations(string: str) -> str:
    """Spell short upper-case abbreviations letter by letter.

    A whitespace-separated token is expanded when its core (the token minus
    leading and trailing punctuation) is upper-case, at most four characters
    long, contains no digit, and is not one of "ID", "AM", "PM". Surrounding
    punctuation is kept in place. Tokens are re-joined with single spaces.
    """
    words = string.split()
    for i, word in enumerate(words):
        lead, core, trail = _TOKEN_PARTS_RE.fullmatch(word).groups()
        if (
            core.isupper()
            and len(core) <= 4
            and not any(char.isdigit() for char in core)
            and core not in _UNSPELLED_ABBREVIATIONS
        ):
            spelled = ''.join(alphabet_map.get(char, char) for char in core)
            words[i] = f"{lead}{spelled}{trail}"
    return ' '.join(words)


# Clean up whitespace in the text
def clean_whitespace(string: str) -> str:
    """Remove whitespace before ``. , ? !`` and collapse all other runs."""
    string = re.sub(r'\s+([.,?!])', r'\1', string)
    return ' '.join(string.split())


# Main preprocessing pipeline
def preprocess_all(string: str) -> str:
    """Run the full pipeline: dates, replacements, numbers, abbreviations,
    whitespace, in that order."""
    string = normalize_dates(string)
    string = replace_invalid_chars(string)
    string = replace_numbers(string)
    string = replace_abbreviations(string)
    string = clean_whitespace(string)
    return string


# Expose a testing function for external use
def test_preprocessing(file_path):
    """Print each stripped line of ``file_path`` next to its processed form.

    The file is read as UTF-8 and streamed line by line.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            original = line.strip()
            processed = preprocess_all(original)
            print(f"Original: {original}")
            print(f"Processed: {processed}\n")


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        test_file = sys.argv[1]
        test_preprocessing(test_file)
    else:
        print("Please provide a file path as an argument.")