Spaces:

thamnt
/

BGMedia

Sleeping

App Files Files Community

BGMedia / app.py

thamnt

Update app.py

044d4bc verified 14 days ago

raw

history blame contribute delete

4.6 kB

	import pandas as pd
	import re
	import string
	import nltk
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer

	import json
	from datetime import datetime
	import gradio as gr
	import io

	# Local directory that NLTK searches (and downloads into) in addition to
	# its default data locations.
	nltk_data_dir = "./nltk_data"
	nltk.data.path.append(nltk_data_dir)

	# Map each downloadable package id to the path nltk.data.find() expects.
	# find() resolves "<category>/<name>" paths, so probing with the bare
	# package id never matches and would fall through to a download on every
	# start-up.
	# NOTE(review): "punkt_tab" is what newer NLTK releases load for
	# word_tokenize; on releases that do not know the package the download
	# call is expected to just report the miss - confirm against the pinned
	# NLTK version.
	nltk_resources = {
	    "stopwords": "corpora/stopwords",
	    "punkt": "tokenizers/punkt",
	    "punkt_tab": "tokenizers/punkt_tab",
	    "wordnet": "corpora/wordnet",
	}
	for resource, resource_path in nltk_resources.items():
	    try:
	        nltk.data.find(resource_path)
	    except LookupError:
	        # Not available locally (or not resolvable): fetch it. A spurious
	        # miss only costs the same download attempt made before this fix.
	        nltk.download(resource, download_dir=nltk_data_dir)

	stop_words = set(stopwords.words('english'))
	lemmatizer = WordNetLemmatizer()



	def clean_text(text):
	    """Return *text* lowercased, stripped of ASCII punctuation and lemmatized.

	    The text is tokenized with ``nltk.word_tokenize``; each token is passed
	    through the module-level ``lemmatizer`` and the results are joined with
	    single spaces.
	    """
	    punctuation_table = str.maketrans('', '', string.punctuation)
	    normalized = text.lower().translate(punctuation_table)
	    lemmas = (
	        lemmatizer.lemmatize(token)
	        for token in nltk.word_tokenize(normalized)
	    )
	    return ' '.join(lemmas)

	def capitalize_sentences(text):
	    """Return *text* with every '.'-separated sentence capitalized.

	    The text is split on '.', each piece is stripped and passed through
	    ``str.capitalize`` (first character upper-cased, the rest lower-cased),
	    empty pieces are dropped, and the remainder is joined with '. ' and
	    terminated with a single '.'.

	    Returns an empty string when the input contains no sentence content
	    (empty, whitespace-only, or only dots) instead of a lone '.'.
	    """
	    pieces = (part.strip().capitalize() for part in text.split('.'))
	    sentences = [sentence for sentence in pieces if sentence]
	    if not sentences:
	        return ''
	    return '. '.join(sentences) + '.'

	def _uploaded_path(upload):
	    """Return the filesystem path of an uploaded file.

	    Accepts either an object exposing the path as ``.name`` or a plain path
	    string.
	    NOTE(review): which of the two Gradio passes depends on its version and
	    the ``gr.File`` ``type`` setting - confirm against the deployed version.
	    """
	    return getattr(upload, "name", upload)


	def _count_clean_words(text):
	    """Return how many words one transcript cell has after cleaning."""
	    if pd.isna(text):
	        # A missing cell contributes no words instead of crashing on .split.
	        return 0
	    # Split on '.' before cleaning so that "end.start" still counts as two
	    # words once clean_text has removed the punctuation.
	    return sum(
	        len(clean_text(piece).split()) for piece in str(text).split('.')
	    )


	def _align_lyrics(word_counts, lyric_lines):
	    """Assign consecutive lyric lines to transcript rows by word count.

	    For each target in *word_counts*, lyric lines are consumed in order
	    until their combined word count reaches the target; the consumed lines
	    are joined with '. '. Rows with a target of zero, and rows reached after
	    the lyric lines are exhausted, get an empty string. If the lines run out
	    before a target is met, that row receives whatever lines remain rather
	    than dropping them.
	    """
	    aligned = []
	    cursor = 0
	    total_lines = len(lyric_lines)
	    for target in word_counts:
	        if target <= 0 or cursor >= total_lines:
	            aligned.append('')
	            continue
	        stop = cursor
	        words = 0
	        while stop < total_lines and words < target:
	            words += len(lyric_lines[stop].split())
	            stop += 1
	        aligned.append('. '.join(lyric_lines[cursor:stop]))
	        cursor = stop
	    return aligned


	def process_transcript(csv_file, txt_file):
	    """Replace transcript text with reference lyric lines and write a TXT file.

	    Args:
	        csv_file: Uploaded transcript CSV. Must contain the columns
	            'Text', 'Speaker Name', 'Start Time' and 'End Time'.
	        txt_file: Uploaded UTF-8 text file of reference lyrics, read as
	            tab-separated with no header; only the first column is used.
	            Lines that consist entirely of a bracketed tag (``[...]``) are
	            skipped.

	    Returns:
	        The path of the written file, "formatted_transcript.txt", containing
	        one "start - end / speaker / text" entry per transcript row.

	    Raises:
	        KeyError: If a required column is missing from the transcript CSV.
	    """
	    transcript = pd.read_csv(_uploaded_path(csv_file))
	    loi_chuan = pd.read_csv(
	        _uploaded_path(txt_file),
	        sep='\t',
	        header=None,
	        encoding='utf-8',
	        engine='python',
	    )

	    # Reference lyric lines: drop bracketed tag lines, clean the rest and
	    # discard lines that end up empty.
	    raw_lines = loi_chuan[0].astype(str).str.strip()
	    raw_lines = raw_lines[~raw_lines.str.fullmatch(r'\[.*\]', na=False)]
	    cleaned_lines = (clean_text(raw).strip() for raw in raw_lines)
	    lyric_lines = [line for line in cleaned_lines if line]

	    # Only the cleaned word count of each transcript row drives alignment.
	    word_counts = [_count_clean_words(text) for text in transcript['Text']]
	    aligned = _align_lyrics(word_counts, lyric_lines)
	    texts = [capitalize_sentences(chunk) if chunk else '' for chunk in aligned]

	    speakers = transcript['Speaker Name'].astype(str).str.strip()
	    rows = zip(
	        transcript['Start Time'], transcript['End Time'], speakers, texts
	    )

	    output_path = "formatted_transcript.txt"
	    with open(output_path, "w", encoding="utf-8") as f:
	        for start_time, end_time, speaker, text in rows:
	            f.write(f"{start_time} - {end_time}\n")
	            f.write(f"{speaker}\n")
	            f.write(f"{text}\n\n")

	    return output_path

	# Input and output components for the web form.
	_transcript_upload = gr.File(label="Upload Transcript CSV")
	_lyric_upload = gr.File(label="Upload Loi Chuan TXT")
	_result_download = gr.File(label="Download formatted_transcript.txt")

	# Wire process_transcript to the two uploads and the downloadable result.
	demo = gr.Interface(
	    fn=process_transcript,
	    inputs=[_transcript_upload, _lyric_upload],
	    outputs=_result_download,
	    title="Transcript Correction to TXT",
	    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output.",
	)

	# Start the Gradio server.
	demo.launch()