import os
import tempfile

import gradio as gr
import torch
import whisper
from gtts import gTTS
from pydub import AudioSegment
from transformers import MBartForConditionalGeneration, MBart50Tokenizer

# Assumed dependencies: pip install torch gradio openai-whisper gtts pydub transformers sentencepiece
# pydub and Whisper also need ffmpeg available on the PATH.

# Load the Whisper speech-to-text model
whisper_model = whisper.load_model("base")

# Load the mBART many-to-many translation model
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Target language for translation
TARGET_LANG = "hi_IN"  # Hindi


def respond(prompt_text, audio_file):
    transcription = None
    try:
        # Prefer typed text; otherwise transcribe the recorded audio
        if prompt_text and prompt_text.strip():
            final_prompt = prompt_text.strip()
        elif audio_file:
            # Convert the upload to WAV for Whisper; close the temp file before
            # exporting so this also works on Windows
            sound = AudioSegment.from_file(audio_file)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpwav:
                wav_path = tmpwav.name
            sound.export(wav_path, format="wav")
            transcription = whisper_model.transcribe(wav_path)["text"].strip()
            os.remove(wav_path)
            final_prompt = transcription
        else:
            return "No prompt provided", "", None

        # Translate the prompt to Hindi with mBART
        # (src_lang is fixed to English, so the prompt is assumed to be English)
        tokenizer.src_lang = "en_XX"
        encoded = tokenizer(final_prompt, return_tensors="pt").to(model.device)
        generated_tokens = model.generate(
            **encoded,
            forced_bos_token_id=tokenizer.lang_code_to_id[TARGET_LANG],
            max_new_tokens=100,
        )
        translated = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

        # Synthesize the Hindi translation as speech
        tts = gTTS(translated, lang="hi")
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            tts.save(fp.name)
            audio_path = fp.name

        return transcription if transcription else "Typed input used", translated, audio_path
    except Exception as e:
        return f"Error: {str(e)}", "", None


with gr.Blocks(theme=gr.themes.Soft(), title="Chat with Vidhya") as demo:
    gr.Markdown("""
    # 🧠 Chat with Vidhya
    **An AI assistant that listens to your voice or reads your text, translates it to Hindi, and speaks the result.**
    """)
    with gr.Row():
        txt_input = gr.Textbox(lines=2, label="Type your prompt (optional)")
        audio_input = gr.Audio(type="filepath", label="Or speak your prompt")
    with gr.Row():
        transcription_output = gr.Textbox(label="Transcribed Speech")
        text_output = gr.Textbox(label="Translated Response (Hindi)")
        audio_output = gr.Audio(type="filepath", label="Spoken Response")
    submit_btn = gr.Button("Submit")
    submit_btn.click(
        fn=respond,
        inputs=[txt_input, audio_input],
        outputs=[transcription_output, text_output, audio_output],
    )

demo.launch()
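
# A minimal smoke test of respond() without the UI (a sketch: run it before
# demo.launch(), or comment the launch out, since launch() blocks). The prompt
# text here is arbitrary; passing None for audio_file skips transcription.
#
#   note, hindi_text, mp3_path = respond("How are you today?", None)
#   print(note)        # -> "Typed input used"
#   print(hindi_text)  # -> the Hindi translation
#   print(mp3_path)    # -> path to the generated .mp3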