BGMedia / app.py
thamnt's picture
Update app.py
044d4bc verified
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import json
from datetime import datetime
import gradio as gr
import io
# Keep NLTK data in a directory next to the app and make NLTK search it.
nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)

# Package ids understood by nltk.download().  "punkt_tab" is included because
# newer NLTK releases load it (instead of "punkt") inside word_tokenize.
nltk_resources = ["stopwords", "punkt", "punkt_tab", "wordnet"]

# nltk.data.find() expects a category-qualified resource path, not the bare
# package id; looking up the bare id always raises LookupError.
_NLTK_RESOURCE_PATHS = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "wordnet": "corpora/wordnet",
}

for resource in nltk_resources:
    try:
        nltk.data.find(_NLTK_RESOURCE_PATHS[resource])
    except LookupError:
        # Only hit the downloader when the resource is really missing.
        nltk.download(resource, download_dir=nltk_data_dir)

# NOTE: stop_words is built here but not used by the functions in this file.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    """Normalize *text* for word-level comparison.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the remainder is tokenized with NLTK and each token is
    lemmatized.  The lemmas are returned as a single space-separated string.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_remover)
    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(normalized))
    return ' '.join(lemmas)
def capitalize_sentences(text):
    """Return *text* with each period-delimited sentence capitalized.

    The text is split on ``'.'``; every fragment is stripped and passed
    through ``str.capitalize`` (first character upper-cased, the rest
    lower-cased).  Empty fragments are discarded, the rest are joined with
    ``'. '`` and a final ``'.'`` is appended, so an input without any
    sentence content yields just ``'.'``.
    """
    capitalized = []
    for fragment in text.strip().split('.'):
        sentence = fragment.strip().capitalize()
        if sentence:
            capitalized.append(sentence)
    joined = '. '.join(capitalized)
    return joined.strip() + '.'
def _uploaded_path(upload):
    """Return the filesystem path of an upload (object with ``.name`` or a plain path)."""
    return getattr(upload, "name", upload)


def _load_lyric_lines(txt_path):
    """Read the reference lyric file and return its cleaned, non-empty lines in order."""
    loi_chuan = pd.read_csv(txt_path, sep='\t', header=None, encoding='utf-8', engine='python')
    raw_lines = loi_chuan[0].astype(str).str.strip()
    # Drop lines that consist entirely of a bracketed tag, e.g. "[...]".
    raw_lines = raw_lines[~raw_lines.str.fullmatch(r'\[.*\]', na=False)]
    cleaned_lines = (clean_text(line).strip() for line in raw_lines)
    return [line for line in cleaned_lines if line]


def _count_transcript_words(text):
    """Count the cleaned words of one transcript cell; a missing cell counts as zero."""
    if pd.isna(text):
        return 0
    # Clean each period-delimited fragment separately so that words on either
    # side of a period are never fused when punctuation is stripped.
    return sum(len(clean_text(fragment).split()) for fragment in str(text).split('.'))


def _align_lyrics(word_counts, lyric_list):
    """Give each transcript row the next lyric lines until its word count is covered."""
    aligned = []
    total_lines = len(lyric_list)
    start = 0
    for target in word_counts:
        # A row without words consumes no lyrics (and must not divide by zero);
        # once the lyrics are exhausted every remaining row stays empty.
        if target <= 0 or start >= total_lines:
            aligned.append('')
            continue
        end = start
        covered = 0
        while end < total_lines and covered < target:
            covered += len(lyric_list[end].split())
            end += 1
        # If the lyrics ran out before reaching the target, the row still
        # receives whatever lines were left instead of silently losing them.
        aligned.append('. '.join(lyric_list[start:end]))
        start = end
    return aligned


def process_transcript(csv_file, txt_file):
    """Replace transcript text with reference lyrics and write a formatted text file.

    The transcript CSV must provide the columns ``Speaker Name``,
    ``Start Time``, ``End Time`` and ``Text``.  The lyric file is read one
    lyric line per row; lines that are entirely a bracketed tag are ignored.
    Both texts are normalized with ``clean_text``.  Walking the transcript in
    order, each row is assigned consecutive lyric lines until they contain at
    least as many words as the row's own cleaned text.  The assigned lyrics
    are sentence-capitalized and written, together with the row's time range
    and speaker, to ``formatted_transcript.txt`` in the working directory.

    Args:
        csv_file: Uploaded transcript CSV; either an object exposing the file
            path as ``.name`` or the path itself.
        txt_file: Uploaded lyric text file, given the same way.

    Returns:
        The path of the written file, ``"formatted_transcript.txt"``.

    Raises:
        KeyError: If the transcript CSV lacks one of the required columns.
    """
    transcript = pd.read_csv(_uploaded_path(csv_file))
    required_columns = ['Speaker Name', 'Start Time', 'End Time', 'Text']
    missing = [column for column in required_columns if column not in transcript.columns]
    if missing:
        raise KeyError(f"Transcript CSV is missing required column(s): {', '.join(missing)}")

    lyric_list = _load_lyric_lines(_uploaded_path(txt_file))
    word_counts = [_count_transcript_words(text) for text in transcript['Text']]
    aligned = _align_lyrics(word_counts, lyric_list)

    # Rows that received no lyrics stay empty rather than becoming a lone ".".
    texts = [capitalize_sentences(lyric).strip() if lyric else '' for lyric in aligned]
    speakers = transcript['Speaker Name'].astype(str).str.strip()

    output_path = "formatted_transcript.txt"
    rows = zip(transcript['Start Time'], transcript['End Time'], speakers, texts)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(
            f"{start_time} - {end_time}\n{speaker}\n{text}\n\n"
            for start_time, end_time, speaker, text in rows
        )
    return output_path
# Gradio front end: two file uploads in, one downloadable text file out.
demo = gr.Interface(
    title="Transcript Correction to TXT",
    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output.",
    fn=process_transcript,
    inputs=[
        gr.File(label=caption)
        for caption in ("Upload Transcript CSV", "Upload Loi Chuan TXT")
    ],
    outputs=gr.File(label="Download formatted_transcript.txt"),
)

# Start the web server as soon as the module is executed.
demo.launch()