import string

import gradio as gr
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources into a local directory on first run.
nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)

# nltk.data.find() expects category-qualified paths (e.g. "corpora/stopwords"),
# so map each download name to the path it is looked up under. punkt_tab is
# required by newer NLTK releases for word_tokenize.
nltk_resources = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "wordnet": "corpora/wordnet",
}
for resource, path in nltk_resources.items():
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource, download_dir=nltk_data_dir)

stop_words = set(stopwords.words('english'))  # loaded but not used by clean_text
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Lowercase, strip punctuation, tokenize, and lemmatize a string."""
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


def capitalize_sentences(text):
    """Capitalize the first letter of each '.'-delimited sentence."""
    sentences = [s.strip().capitalize() for s in text.strip().split('.')]
    return '. '.join([s for s in sentences if s]).strip() + '.'


def process_transcript(csv_file, txt_file):
    transcript = pd.read_csv(csv_file.name)
    loi_chuan = pd.read_csv(txt_file.name, sep='\t', header=None,
                            encoding='utf-8', engine='python')

    # Drop section markers such as "[Chorus]" and clean each lyric line.
    loi_chuan[0] = loi_chuan[0].astype(str).str.strip()
    loi_chuan = loi_chuan[~loi_chuan[0].str.fullmatch(r'\[.*\]', na=False)]
    loi_chuan['cleaned_text'] = loi_chuan[0].apply(clean_text)
    loi_chuan = loi_chuan.reset_index(drop=True)

    # Build a flat, '. '-separated list of cleaned lyric lines.
    lyric = ''
    for i in range(len(loi_chuan['cleaned_text'])):
        lyric = lyric.strip() + ". " + loi_chuan['cleaned_text'][i].strip()
    lyric = lyric.lstrip('.').strip()
    lyric_list = lyric.split('. ')

    # Clean each transcript segment, sentence by sentence.
    transcript['processed_text'] = ""
    for i in range(len(transcript)):
        transcript.at[i, 'processed_text'] = str(transcript['Text'][i]).split('.')
        for j in range(len(transcript.at[i, 'processed_text'])):
            transcript.at[i, 'processed_text'][j] = clean_text(transcript.at[i, 'processed_text'][j])

    # Rejoin the cleaned sentences of each segment into a single string.
    transcript['Renew_processed_text'] = ''
    for i in range(len(transcript)):
        for j in range(len(transcript['processed_text'][i])):
            transcript.at[i, 'Renew_processed_text'] = (
                transcript.at[i, 'Renew_processed_text'].strip() + " "
                + transcript.at[i, 'processed_text'][j].strip()
            )
    transcript['Renew_processed_text'] = transcript['Renew_processed_text'].astype(str).str.lstrip('.')

    # Greedily align lyric lines to segments: extend the lyric window
    # [start:end] until its word count reaches the segment's word count,
    # then assign the window to the segment and move on.
    start = 0
    end = 0
    transcript['lyric'] = ''
    max_lyric_index = len(lyric_list)
    for i in range(len(transcript)):
        # 'or 1' guards against empty segments causing division by zero.
        len_transcript = len(transcript['Renew_processed_text'][i].split()) or 1
        len_lyric = 0
        while (len_lyric / len_transcript < 1) and (end < max_lyric_index):
            sequence = " ".join(lyric_list[start:end + 1])
            len_lyric = len(sequence.split())
            if len_lyric / len_transcript >= 1:
                transcript.at[i, 'lyric'] = ". ".join(lyric_list[start:end + 1])
                start = end + 1
                end = start
            else:
                end += 1

    for i in range(len(transcript)):
        if transcript.at[i, 'lyric']:
            transcript.at[i, 'lyric'] = capitalize_sentences(transcript.at[i, 'lyric'])

    # .copy() avoids pandas' SettingWithCopyWarning on the assignments below.
    df_final = transcript[['Speaker Name', 'Start Time', 'End Time', 'lyric']].copy()
    df_final.columns = ['Speaker Name', 'Start Time', 'End Time', 'Text']
    df_final['Text'] = df_final['Text'].astype(str).str.strip()
    df_final['Speaker Name'] = df_final['Speaker Name'].astype(str).str.strip()

    # Write one block per segment: time range, speaker, corrected text.
    output_path = "formatted_transcript.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        for _, row in df_final.iterrows():
            f.write(f"{row['Start Time']} - {row['End Time']}\n")
            f.write(f"{row['Speaker Name']}\n")
            f.write(f"{row['Text']}\n\n")
    return output_path


demo = gr.Interface(
    fn=process_transcript,
    inputs=[
        gr.File(label="Upload Transcript CSV"),
        gr.File(label="Upload Clean Lyrics TXT"),
    ],
    outputs=gr.File(label="Download formatted_transcript.txt"),
    title="Transcript Correction to TXT",
    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output.",
)

demo.launch()