# appImage.py
"""Image-captioning helper: caption an uploaded image and (best-effort)
synthesize an MP3 narration of the caption."""
from transformers import pipeline
import logging
import tempfile
import os
from PIL import Image  # noqa: F401  NOTE(review): unused here; kept in case other code relies on it
from gtts import gTTS

logger = logging.getLogger(__name__)

# Loaded once at import time; first run downloads the model weights.
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")


async def caption_image(file):
    """Caption an uploaded image.

    Args:
        file: async file-like upload object exposing ``await file.read()``
              (e.g. a FastAPI ``UploadFile``) — assumed, confirm against caller.

    Returns:
        dict with key ``"caption"`` and, when text-to-speech succeeds,
        ``"audioUrl"`` pointing at the generated MP3 under ``/files/``.
    """
    contents = await file.read()
    # delete=False so the captioner can reopen the path after the handle closes.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        tmp.write(contents)
        image_path = tmp.name
    try:
        captions = captioner(image_path)
        caption = captions[0]['generated_text'] if captions else "No caption generated."
    finally:
        # Fix: the original leaked one temp image file per request.
        os.unlink(image_path)

    result = {"caption": caption}
    audio_path = text_to_speech(caption)
    if audio_path:
        result["audioUrl"] = f"/files/{os.path.basename(audio_path)}"
    return result


def text_to_speech(text: str) -> str:
    """Synthesize *text* to a temporary MP3.

    Returns the MP3 path on success, or ``""`` on any failure (callers treat
    a falsy value as "no audio"; this preserves the original best-effort
    contract).
    """
    try:
        tts = gTTS(text)
        # Fix: close the OS handle before saving (the original left it open);
        # delete=False keeps the file on disk for the caller to serve.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception:
        # Fix: narrowed from a bare `except:` that also swallowed
        # SystemExit/KeyboardInterrupt; log so failures are diagnosable.
        logger.exception("text-to-speech failed")
        return ""