import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import gradio as gr

nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)

# Map each resource to the path nltk.data.find() expects, so the check succeeds
# once the resource is installed and downloads only happen on the first run.
nltk_resources = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",  # needed by word_tokenize on newer NLTK releases
    "wordnet": "corpora/wordnet",
}
for resource, resource_path in nltk_resources.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource, download_dir=nltk_data_dir)

stop_words = set(stopwords.words('english'))  # currently unused in clean_text; kept for optional stop-word removal
lemmatizer = WordNetLemmatizer()



def clean_text(text):
    """Lowercase, strip punctuation, tokenize, and lemmatize a piece of text."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    cleaned_text = ' '.join(tokens)
    return cleaned_text
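# Illustrative check (hypothetical values):
#   clean_text("The cats are running.")  ->  "the cat are running"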

def capitalize_sentences(text):
    """Capitalize each '.'-separated sentence and rejoin with '. ', ending in '.'."""
    sentences = [s.strip().capitalize() for s in text.strip().split('.')]
    return '. '.join([s for s in sentences if s]).strip() + '.'
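# Illustrative check (hypothetical values):
#   capitalize_sentences("hello there. how are you")  ->  "Hello there. How are you."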

def process_transcript(csv_file, txt_file):
    """Align a raw transcript CSV with a clean lyrics TXT and write a formatted .txt file."""
    transcript = pd.read_csv(csv_file.name)
    loi_chuan = pd.read_csv(txt_file.name, sep='\t', header=None, encoding='utf-8', engine='python')

    # Drop bracketed section markers and clean each lyric line
    loi_chuan[0] = loi_chuan[0].astype(str).str.strip()
    loi_chuan = loi_chuan[~loi_chuan[0].str.fullmatch(r'\[.*\]', na=False)]
    loi_chuan['cleaned_text'] = loi_chuan[0].apply(clean_text)
    loi_chuan = loi_chuan.reset_index(drop=True)
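    # Expected inputs (inferred from the columns used in this function): the
    # transcript CSV has at least 'Speaker Name', 'Start Time', 'End Time', and
    # 'Text' columns; the lyrics TXT has one lyric line per row, with bracketed
    # section markers such as "[Chorus]" on their own lines (dropped above).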

    # Join the cleaned lyric lines into one '. '-separated string, then split it
    # back into a flat list of lyric sentences for alignment
    lyric = ''
    for i in range(len(loi_chuan['cleaned_text'])):
        lyric = lyric.strip() + ". " + loi_chuan['cleaned_text'][i].strip()
    lyric = lyric.lstrip('.').strip()
    lyric_list = lyric.split('. ')

    # Clean transcript
    transcript['processed_text'] = ""
    for i in range(len(transcript)):
        transcript.at[i, 'processed_text'] = transcript['Text'][i].split('.')
        for j in range(len(transcript.at[i, 'processed_text'])):
            transcript.at[i, 'processed_text'][j] = clean_text(transcript.at[i, 'processed_text'][j])

    transcript['Renew_processed_text'] = ''
    for i in range(len(transcript)):
        for j in range(len(transcript['processed_text'][i])):
            transcript.at[i, 'Renew_processed_text'] = transcript.at[i, 'Renew_processed_text'].strip() + " " + transcript.at[i, 'processed_text'][j].strip()

    transcript['Renew_processed_text'] = transcript['Renew_processed_text'].astype(str).str.lstrip('.')

    # Greedily assign lyric sentences to each transcript row: keep appending lyric
    # lines until their word count reaches the word count of the row's cleaned text.
    start = 0
    end = 0
    transcript['lyric'] = ''
    max_lyric_index = len(lyric_list)

    for i in range(len(transcript)):
        len_transcript = len(transcript['Renew_processed_text'][i].split())
        if len_transcript == 0:
            continue  # skip empty rows to avoid division by zero below
        len_lyric = 0

        while (len_lyric / len_transcript < 1) and (end < max_lyric_index):
            sequence = " ".join(lyric_list[start:end+1])
            len_lyric = len(sequence.split())

            if len_lyric / len_transcript >= 1:
                transcript.at[i, 'lyric'] = ". ".join(lyric_list[start:end+1])
                start = end + 1
                end = start
            else:
                end += 1
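    # Illustrative walk-through of the alignment above (hypothetical values): if a
    # row's cleaned text has 5 words and lyric_list is ["hello there my friend",
    # "how are you"], the loop first collects 4 words (too few), then 7 (enough),
    # so that row receives "hello there my friend. how are you" and the next row
    # resumes from lyric index 2.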

    for i in range(len(transcript)):
        if transcript.at[i, 'lyric']:
            transcript.at[i, 'lyric'] = capitalize_sentences(transcript.at[i, 'lyric'])

    # Keep only the output columns; .copy() avoids pandas' SettingWithCopyWarning
    df_final = transcript[['Speaker Name', 'Start Time', 'End Time', 'lyric']].copy()
    df_final.columns = ['Speaker Name', 'Start Time', 'End Time', 'Text']

    df_final['Text'] = df_final['Text'].astype(str).str.strip()
    df_final['Speaker Name'] = df_final['Speaker Name'].astype(str).str.strip()

    output_path = "formatted_transcript.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        for _, row in df_final.iterrows():
            f.write(f"{row['Start Time']} - {row['End Time']}\n")
            f.write(f"{row['Speaker Name']}\n")
            f.write(f"{row['Text']}\n\n")

    return output_path
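# Output layout produced by process_transcript (illustrative values):
#   00:00:01 - 00:00:05
#   Speaker 1
#   Hello there. How are you.
#   (blank line between blocks)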

demo = gr.Interface(
    fn=process_transcript,
    inputs=[
        gr.File(label="Upload Transcript CSV"),
        gr.File(label="Upload Loi Chuan TXT")
    ],
    outputs=gr.File(label="Download formatted_transcript.txt"),
    title="Transcript Correction to TXT",
    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output."
)

demo.launch()
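
# To try this locally (assuming the script is saved as app.py): run `python app.py`
# and open the local URL that Gradio prints in the terminal.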