|
import pandas as pd |
|
import re |
|
import string |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
|
|
import json |
|
from datetime import datetime |
|
import gradio as gr |
|
import io |
|
|
|
# Keep NLTK data in a project-local directory and make NLTK search it.
nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)

# Package ids handed to nltk.download(). "punkt_tab" is the tokenizer data
# that nltk.word_tokenize loads on NLTK >= 3.9; "punkt" is kept for older
# releases.
nltk_resources = ["stopwords", "punkt", "punkt_tab", "wordnet"]

# nltk.data.find() expects a category-qualified resource path, not a bare
# package id. Looking up the bare id never succeeds, which forced a download
# attempt on every start-up even when the data was already present.
_NLTK_RESOURCE_PATHS = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "wordnet": "corpora/wordnet",
}

for resource in nltk_resources:
    try:
        nltk.data.find(_NLTK_RESOURCE_PATHS.get(resource, resource))
    except LookupError:
        # Only fetch what is actually missing.
        nltk.download(resource, download_dir=nltk_data_dir)

# English stop-word set (built here; not referenced by the functions visible
# in this file).
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance used by clean_text().
lemmatizer = WordNetLemmatizer()
|
|
|
|
|
|
|
def clean_text(text):
    """Normalize *text* so it can be compared word by word.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is removed, the remainder is tokenized with
    ``nltk.word_tokenize`` and each token is passed through the module-level
    ``lemmatizer`` (called with its default arguments). Stop words are not
    removed.

    Returns the resulting tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(strip_punctuation)
    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(normalized))
    return ' '.join(lemmas)
|
|
|
def capitalize_sentences(text):
    """Return *text* with each '.'-separated sentence capitalized.

    The text is split on '.', each piece is stripped and passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased),
    empty pieces are dropped, and the remainder is joined with '. ' and
    terminated with a single '.'.

    If no non-empty sentence remains (empty, whitespace-only or dots-only
    input) an empty string is returned instead of a lone '.'.
    """
    sentences = [piece.strip().capitalize() for piece in text.split('.')]
    sentences = [sentence for sentence in sentences if sentence]
    if not sentences:
        # Nothing to punctuate; avoid emitting a stray period.
        return ''
    return '. '.join(sentences) + '.'
|
|
|
def _uploaded_path(file_obj):
    """Return a filesystem path for an upload given as a path or a file wrapper."""
    if isinstance(file_obj, str) or hasattr(file_obj, "__fspath__"):
        return file_obj
    # File-like wrappers (e.g. temp-file objects) expose the path as ``.name``.
    return file_obj.name


def _load_lyric_lines(path):
    """Read the reference lyric file and return its non-empty cleaned lines."""
    # NOTE(review): read_csv treats '"' as a quote character by default, so a
    # lyric line with an unbalanced double quote may be merged with the lines
    # that follow it -- confirm inputs never contain one.
    loi_chuan = pd.read_csv(path, sep='\t', header=None, encoding='utf-8', engine='python')
    raw_lines = loi_chuan[0].astype(str).str.strip()
    # Drop lines that consist solely of a bracketed tag, e.g. "[Chorus]".
    raw_lines = raw_lines[~raw_lines.str.fullmatch(r'\[.*\]', na=False)]
    cleaned = (clean_text(line).strip() for line in raw_lines)
    return [line for line in cleaned if line]


def _count_clean_words(text):
    """Count the words left after cleaning each '.'-separated piece of *text*."""
    return sum(len(clean_text(piece).split()) for piece in text.split('.'))


def _align_lyrics(word_counts, lyric_lines):
    """Assign consecutive lyric lines to rows until each row's word count is met."""
    aligned = []
    total = len(lyric_lines)
    start = 0
    for target in word_counts:
        # A row without words consumes nothing (previously a division by
        # zero); once the lyric is exhausted the remaining rows stay empty.
        if target == 0 or start >= total:
            aligned.append('')
            continue
        end = start
        words = 0
        while end < total:
            words += len(lyric_lines[end].split())
            end += 1
            if words >= target:
                break
        # If the lyric ran out before the target was reached, the remaining
        # lines are still attached to this row rather than silently dropped.
        aligned.append('. '.join(lyric_lines[start:end]))
        start = end
    return aligned


def process_transcript(csv_file, txt_file):
    """Replace transcript text with reference lyric lines and write a TXT file.

    Each transcript row receives as many consecutive reference lyric lines as
    are needed to reach the row's own (cleaned) word count. The matched lines
    are sentence-capitalized and written, together with the row's timing and
    speaker, to ``formatted_transcript.txt`` in the current working directory.

    Args:
        csv_file: Uploaded transcript CSV (an object exposing the path as
            ``.name``, or a plain path). It must contain the columns
            'Speaker Name', 'Start Time', 'End Time' and 'Text'.
        txt_file: Uploaded reference lyric text file (same accepted forms),
            one lyric line per row; lines that are only a bracketed tag are
            ignored.

    Returns:
        The path of the written text file.

    Raises:
        KeyError: If the transcript CSV lacks one of the required columns.
    """
    transcript = pd.read_csv(_uploaded_path(csv_file))

    required_columns = ['Speaker Name', 'Start Time', 'End Time', 'Text']
    missing = [column for column in required_columns if column not in transcript.columns]
    if missing:
        raise KeyError(f"Transcript CSV is missing required column(s): {missing}")

    lyric_lines = _load_lyric_lines(_uploaded_path(txt_file))

    # Missing text cells are treated as empty rows instead of crashing.
    row_texts = transcript['Text'].fillna('').astype(str)
    word_counts = [_count_clean_words(text) for text in row_texts]

    aligned = _align_lyrics(word_counts, lyric_lines)
    lyrics = [capitalize_sentences(text) if text else '' for text in aligned]

    # Work on an explicit copy so the assignments below never touch a view.
    df_final = transcript[['Speaker Name', 'Start Time', 'End Time']].copy()
    df_final['Text'] = [text.strip() for text in lyrics]
    df_final['Speaker Name'] = df_final['Speaker Name'].astype(str).str.strip()

    output_path = "formatted_transcript.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        rows = df_final[['Start Time', 'End Time', 'Speaker Name', 'Text']]
        for start_time, end_time, speaker, text in rows.itertuples(index=False, name=None):
            f.write(f"{start_time} - {end_time}\n")
            f.write(f"{speaker}\n")
            f.write(f"{text}\n\n")

    return output_path
|
|
|
# Upload widgets, listed in the positional order of
# process_transcript(csv_file, txt_file), followed by the download widget.
_transcript_upload = gr.File(label="Upload Transcript CSV")
_lyric_upload = gr.File(label="Upload Loi Chuan TXT")
_result_download = gr.File(label="Download formatted_transcript.txt")

# Gradio UI wiring the two uploads to process_transcript and offering the
# returned file path for download.
demo = gr.Interface(
    title="Transcript Correction to TXT",
    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output.",
    fn=process_transcript,
    inputs=[_transcript_upload, _lyric_upload],
    outputs=_result_download,
)

# Start the web UI as soon as this file is executed.
demo.launch()
|
|