import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gradio as gr
nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)
# nltk.data.find() needs the category prefix (e.g. "corpora/stopwords");
# a bare name always raises LookupError and forces a re-download on every run.
# punkt_tab is included because word_tokenize requires it on newer NLTK releases.
nltk_resources = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "wordnet": "corpora/wordnet",
}
for resource, path in nltk_resources.items():
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource, download_dir=nltk_data_dir)
stop_words = set(stopwords.words('english'))  # loaded but not currently used by clean_text
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    """Lowercase, strip punctuation, tokenize, and lemmatize a string."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
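# Example (illustrative): clean_text("The cats sang loudly.") -> "the cat sang loudly"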
def capitalize_sentences(text):
    """Capitalize each '.'-separated sentence and rejoin with a trailing period."""
    sentences = [s.strip().capitalize() for s in text.strip().split('.')]
    return '. '.join([s for s in sentences if s]).strip() + '.'
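# Example (illustrative): capitalize_sentences("hello world. how are you") -> "Hello world. How are you."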
def process_transcript(csv_file, txt_file):
    transcript = pd.read_csv(csv_file.name)
    # "loi chuan" is Vietnamese for "standard lyrics": one clean lyric line per row.
    loi_chuan = pd.read_csv(txt_file.name, sep='\t', header=None, encoding='utf-8', engine='python')
    loi_chuan[0] = loi_chuan[0].astype(str).str.strip()
    # Drop annotation lines such as "[Chorus]".
    loi_chuan = loi_chuan[~loi_chuan[0].str.fullmatch(r'\[.*\]', na=False)]
    loi_chuan['cleaned_text'] = loi_chuan[0].apply(clean_text)
    loi_chuan = loi_chuan.reset_index(drop=True)
    # Build the lyric list: concatenate cleaned lines, then split back into sentences.
    lyric = ''
    for i in range(len(loi_chuan['cleaned_text'])):
        lyric = lyric.strip() + ". " + loi_chuan['cleaned_text'][i].strip()
    lyric = lyric.lstrip('.').strip()
    lyric_list = lyric.split('. ')
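    # lyric_list now holds one cleaned lyric line per element
    # (illustrative: ["hello world", "how are you"]).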
    # Clean the transcript: split each row's text on '.', clean each piece,
    # then rejoin the pieces into one space-separated string per row.
    transcript['processed_text'] = ""
    for i in range(len(transcript)):
        transcript.at[i, 'processed_text'] = [
            clean_text(part) for part in transcript.at[i, 'Text'].split('.')
        ]
    transcript['Renew_processed_text'] = ''
    for i in range(len(transcript)):
        transcript.at[i, 'Renew_processed_text'] = ' '.join(
            part.strip() for part in transcript.at[i, 'processed_text']
        ).strip()
    # Greedy alignment: walk through lyric_list, giving each transcript row
    # lyric sentences until their word count matches the row's word count.
    start = 0
    end = 0
    transcript['lyric'] = ''
    max_lyric_index = len(lyric_list)
    for i in range(len(transcript)):
        len_transcript = len(transcript.at[i, 'Renew_processed_text'].split())
        if len_transcript == 0:
            continue  # skip empty rows to avoid dividing by zero
        len_lyric = 0
        while (len_lyric / len_transcript < 1) and (end < max_lyric_index):
            sequence = " ".join(lyric_list[start:end + 1])
            len_lyric = len(sequence.split())
            if len_lyric / len_transcript >= 1:
                # Enough lyric words to cover this row: assign and advance the window.
                transcript.at[i, 'lyric'] = ". ".join(lyric_list[start:end + 1])
                start = end + 1
                end = start
            else:
                end += 1
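    # Note: rows whose word-count threshold is never met keep an empty lyric,
    # and any lyric sentences left over after the last row are dropped.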
    # Restore sentence capitalization on the aligned lyric text.
    for i in range(len(transcript)):
        if transcript.at[i, 'lyric']:
            transcript.at[i, 'lyric'] = capitalize_sentences(transcript.at[i, 'lyric'])
    # .copy() avoids pandas' SettingWithCopyWarning on the column edits below.
    df_final = transcript[['Speaker Name', 'Start Time', 'End Time', 'lyric']].copy()
    df_final.columns = ['Speaker Name', 'Start Time', 'End Time', 'Text']
    df_final['Text'] = df_final['Text'].astype(str).str.strip()
    df_final['Speaker Name'] = df_final['Speaker Name'].astype(str).str.strip()
    # Write each entry as: time range, speaker, text, blank line.
    output_path = "formatted_transcript.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        for _, row in df_final.iterrows():
            f.write(f"{row['Start Time']} - {row['End Time']}\n")
            f.write(f"{row['Speaker Name']}\n")
            f.write(f"{row['Text']}\n\n")
    return output_path
demo = gr.Interface(
    fn=process_transcript,
    inputs=[
        gr.File(label="Upload Transcript CSV"),
        gr.File(label="Upload Loi Chuan TXT"),
    ],
    outputs=gr.File(label="Download formatted_transcript.txt"),
    title="Transcript Correction to TXT",
    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output.",
)

demo.launch()
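# demo.launch(share=True)  # assumption: use share=True instead for a public link when running remotely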