# appImage.py
"""Image-captioning helper: caption an uploaded image and (best-effort)
synthesize an MP3 narration of the caption."""
from transformers import pipeline
import logging
import tempfile
import os
from PIL import Image  # noqa: F401  NOTE(review): unused here; kept in case other code relies on it
from gtts import gTTS

logger = logging.getLogger(__name__)

# Loaded once at import time; first run downloads the model weights.
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")


async def caption_image(file):
    """Caption an uploaded image.

    Args:
        file: async file-like upload object exposing ``await file.read()``
              (e.g. a FastAPI ``UploadFile``) — assumed, confirm against caller.

    Returns:
        dict with key ``"caption"`` and, when text-to-speech succeeds,
        ``"audioUrl"`` pointing at the generated MP3 under ``/files/``.
    """
    contents = await file.read()
    # delete=False so the captioner can reopen the path after the handle closes.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        tmp.write(contents)
        image_path = tmp.name
    try:
        captions = captioner(image_path)
        caption = captions[0]['generated_text'] if captions else "No caption generated."
    finally:
        # Fix: the original leaked one temp image file per request.
        os.unlink(image_path)

    result = {"caption": caption}
    audio_path = text_to_speech(caption)
    if audio_path:
        result["audioUrl"] = f"/files/{os.path.basename(audio_path)}"
    return result


def text_to_speech(text: str) -> str:
    """Synthesize *text* to a temporary MP3.

    Returns the MP3 path on success, or ``""`` on any failure (callers treat
    a falsy value as "no audio"; this preserves the original best-effort
    contract).
    """
    try:
        tts = gTTS(text)
        # Fix: close the OS handle before saving (the original left it open);
        # delete=False keeps the file on disk for the caller to serve.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception:
        # Fix: narrowed from a bare `except:` that also swallowed
        # SystemExit/KeyboardInterrupt; log so failures are diagnosable.
        logger.exception("text-to-speech failed")
        return ""