import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import gradio as gr

nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)

# Map each resource to the path nltk.data.find() expects, so the check succeeds
# once the resource is installed and downloads only happen on the first run.
nltk_resources = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",  # needed by word_tokenize on newer NLTK releases
    "wordnet": "corpora/wordnet",
}
for resource, resource_path in nltk_resources.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource, download_dir=nltk_data_dir)

stop_words = set(stopwords.words('english'))  # currently unused in clean_text; kept for optional stop-word removal
lemmatizer = WordNetLemmatizer()



def clean_text(text):
    """Lowercase, strip punctuation, tokenize, and lemmatize a piece of text."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    cleaned_text = ' '.join(tokens)
    return cleaned_text
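# Illustrative check (hypothetical values):
#   clean_text("The cats are running.")  ->  "the cat are running"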

def capitalize_sentences(text):
    """Capitalize each '.'-separated sentence and rejoin with '. ', ending in '.'."""
    sentences = [s.strip().capitalize() for s in text.strip().split('.')]
    return '. '.join([s for s in sentences if s]).strip() + '.'
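# Illustrative check (hypothetical values):
#   capitalize_sentences("hello there. how are you")  ->  "Hello there. How are you."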

def process_transcript(csv_file, txt_file):
    """Align a raw transcript CSV with a clean lyrics TXT and write a formatted .txt file."""
    transcript = pd.read_csv(csv_file.name)
    loi_chuan = pd.read_csv(txt_file.name, sep='\t', header=None, encoding='utf-8', engine='python')

    # Drop bracketed section markers and clean each lyric line
    loi_chuan[0] = loi_chuan[0].astype(str).str.strip()
    loi_chuan = loi_chuan[~loi_chuan[0].str.fullmatch(r'\[.*\]', na=False)]
    loi_chuan['cleaned_text'] = loi_chuan[0].apply(clean_text)
    loi_chuan = loi_chuan.reset_index(drop=True)
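    # Expected inputs (inferred from the columns used in this function): the
    # transcript CSV has at least 'Speaker Name', 'Start Time', 'End Time', and
    # 'Text' columns; the lyrics TXT has one lyric line per row, with bracketed
    # section markers such as "[Chorus]" on their own lines (dropped above).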

    # Join the cleaned lyric lines into one '. '-separated string, then split it
    # back into a flat list of lyric sentences for alignment
    lyric = ''
    for i in range(len(loi_chuan['cleaned_text'])):
        lyric = lyric.strip() + ". " + loi_chuan['cleaned_text'][i].strip()
    lyric = lyric.lstrip('.').strip()
    lyric_list = lyric.split('. ')

    # Clean transcript
    transcript['processed_text'] = ""
    for i in range(len(transcript)):
        transcript.at[i, 'processed_text'] = transcript['Text'][i].split('.')
        for j in range(len(transcript.at[i, 'processed_text'])):
            transcript.at[i, 'processed_text'][j] = clean_text(transcript.at[i, 'processed_text'][j])

    transcript['Renew_processed_text'] = ''
    for i in range(len(transcript)):
        for j in range(len(transcript['processed_text'][i])):
            transcript.at[i, 'Renew_processed_text'] = transcript.at[i, 'Renew_processed_text'].strip() + " " + transcript.at[i, 'processed_text'][j].strip()

    transcript['Renew_processed_text'] = transcript['Renew_processed_text'].astype(str).str.lstrip('.')

    # Greedily assign lyric sentences to each transcript row: keep appending lyric
    # lines until their word count reaches the word count of the row's cleaned text.
    start = 0
    end = 0
    transcript['lyric'] = ''
    max_lyric_index = len(lyric_list)

    for i in range(len(transcript)):
        len_transcript = len(transcript['Renew_processed_text'][i].split())
        if len_transcript == 0:
            continue  # skip empty rows to avoid division by zero below
        len_lyric = 0

        while (len_lyric / len_transcript < 1) and (end < max_lyric_index):
            sequence = " ".join(lyric_list[start:end+1])
            len_lyric = len(sequence.split())

            if len_lyric / len_transcript >= 1:
                transcript.at[i, 'lyric'] = ". ".join(lyric_list[start:end+1])
                start = end + 1
                end = start
            else:
                end += 1
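    # Illustrative walk-through of the alignment above (hypothetical values): if a
    # row's cleaned text has 5 words and lyric_list is ["hello there my friend",
    # "how are you"], the loop first collects 4 words (too few), then 7 (enough),
    # so that row receives "hello there my friend. how are you" and the next row
    # resumes from lyric index 2.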

    for i in range(len(transcript)):
        if transcript.at[i, 'lyric']:
            transcript.at[i, 'lyric'] = capitalize_sentences(transcript.at[i, 'lyric'])

    # Keep only the output columns; .copy() avoids pandas' SettingWithCopyWarning
    df_final = transcript[['Speaker Name', 'Start Time', 'End Time', 'lyric']].copy()
    df_final.columns = ['Speaker Name', 'Start Time', 'End Time', 'Text']

    df_final['Text'] = df_final['Text'].astype(str).str.strip()
    df_final['Speaker Name'] = df_final['Speaker Name'].astype(str).str.strip()

    output_path = "formatted_transcript.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        for _, row in df_final.iterrows():
            f.write(f"{row['Start Time']} - {row['End Time']}\n")
            f.write(f"{row['Speaker Name']}\n")
            f.write(f"{row['Text']}\n\n")

    return output_path
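# Output layout produced by process_transcript (illustrative values):
#   00:00:01 - 00:00:05
#   Speaker 1
#   Hello there. How are you.
#   (blank line between blocks)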

demo = gr.Interface(
    fn=process_transcript,
    inputs=[
        gr.File(label="Upload Transcript CSV"),
        gr.File(label="Upload Loi Chuan TXT")
    ],
    outputs=gr.File(label="Download formatted_transcript.txt"),
    title="Transcript Correction to TXT",
    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output."
)

demo.launch()
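
# To try this locally (assuming the script is saved as app.py): run `python app.py`
# and open the local URL that Gradio prints in the terminal.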