import string

import gradio as gr
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources into a local directory on first run.
nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)

# nltk.data.find() expects category-qualified paths (e.g. "corpora/stopwords"),
# so map each download name to the path it is looked up under. punkt_tab is
# required by newer NLTK releases for word_tokenize.
nltk_resources = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "wordnet": "corpora/wordnet",
}
for resource, path in nltk_resources.items():
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource, download_dir=nltk_data_dir)

stop_words = set(stopwords.words('english'))  # loaded but not used by clean_text
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Lowercase, strip punctuation, tokenize, and lemmatize a string."""
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


def capitalize_sentences(text):
    """Capitalize the first letter of each '.'-delimited sentence."""
    sentences = [s.strip().capitalize() for s in text.strip().split('.')]
    return '. '.join([s for s in sentences if s]).strip() + '.'


def process_transcript(csv_file, txt_file):
    transcript = pd.read_csv(csv_file.name)
    loi_chuan = pd.read_csv(txt_file.name, sep='\t', header=None,
                            encoding='utf-8', engine='python')

    # Drop section markers such as "[Chorus]" and clean each lyric line.
    loi_chuan[0] = loi_chuan[0].astype(str).str.strip()
    loi_chuan = loi_chuan[~loi_chuan[0].str.fullmatch(r'\[.*\]', na=False)]
    loi_chuan['cleaned_text'] = loi_chuan[0].apply(clean_text)
    loi_chuan = loi_chuan.reset_index(drop=True)

    # Build a flat, '. '-separated list of cleaned lyric lines.
    lyric = ''
    for i in range(len(loi_chuan['cleaned_text'])):
        lyric = lyric.strip() + ". " + loi_chuan['cleaned_text'][i].strip()
    lyric = lyric.lstrip('.').strip()
    lyric_list = lyric.split('. ')

    # Clean each transcript segment, sentence by sentence.
    transcript['processed_text'] = ""
    for i in range(len(transcript)):
        transcript.at[i, 'processed_text'] = str(transcript['Text'][i]).split('.')
        for j in range(len(transcript.at[i, 'processed_text'])):
            transcript.at[i, 'processed_text'][j] = clean_text(transcript.at[i, 'processed_text'][j])

    # Rejoin the cleaned sentences of each segment into a single string.
    transcript['Renew_processed_text'] = ''
    for i in range(len(transcript)):
        for j in range(len(transcript['processed_text'][i])):
            transcript.at[i, 'Renew_processed_text'] = (
                transcript.at[i, 'Renew_processed_text'].strip() + " "
                + transcript.at[i, 'processed_text'][j].strip()
            )
    transcript['Renew_processed_text'] = transcript['Renew_processed_text'].astype(str).str.lstrip('.')

    # Greedily align lyric lines to segments: extend the lyric window
    # [start:end] until its word count reaches the segment's word count,
    # then assign the window to the segment and move on.
    start = 0
    end = 0
    transcript['lyric'] = ''
    max_lyric_index = len(lyric_list)
    for i in range(len(transcript)):
        # 'or 1' guards against empty segments causing division by zero.
        len_transcript = len(transcript['Renew_processed_text'][i].split()) or 1
        len_lyric = 0
        while (len_lyric / len_transcript < 1) and (end < max_lyric_index):
            sequence = " ".join(lyric_list[start:end + 1])
            len_lyric = len(sequence.split())
            if len_lyric / len_transcript >= 1:
                transcript.at[i, 'lyric'] = ". ".join(lyric_list[start:end + 1])
                start = end + 1
                end = start
            else:
                end += 1

    for i in range(len(transcript)):
        if transcript.at[i, 'lyric']:
            transcript.at[i, 'lyric'] = capitalize_sentences(transcript.at[i, 'lyric'])

    # .copy() avoids pandas' SettingWithCopyWarning on the assignments below.
    df_final = transcript[['Speaker Name', 'Start Time', 'End Time', 'lyric']].copy()
    df_final.columns = ['Speaker Name', 'Start Time', 'End Time', 'Text']
    df_final['Text'] = df_final['Text'].astype(str).str.strip()
    df_final['Speaker Name'] = df_final['Speaker Name'].astype(str).str.strip()

    # Write one block per segment: time range, speaker, corrected text.
    output_path = "formatted_transcript.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        for _, row in df_final.iterrows():
            f.write(f"{row['Start Time']} - {row['End Time']}\n")
            f.write(f"{row['Speaker Name']}\n")
            f.write(f"{row['Text']}\n\n")
    return output_path


demo = gr.Interface(
    fn=process_transcript,
    inputs=[
        gr.File(label="Upload Transcript CSV"),
        gr.File(label="Upload Clean Lyrics TXT"),
    ],
    outputs=gr.File(label="Download formatted_transcript.txt"),
    title="Transcript Correction to TXT",
    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output.",
)

demo.launch()