qtAnswering / appImage.py
ikraamkb's picture
Update appImage.py
297dd8a verified
raw
history blame
940 Bytes
# appImage.py
import logging
import os
import tempfile

from gtts import gTTS
from PIL import Image
from transformers import pipeline
# Module-level image-to-text pipeline, constructed once when this module is
# imported and reused by caption_image on each call.
# NOTE(review): constructing the pipeline presumably loads (and may download)
# the model weights at import time -- confirm this start-up cost is acceptable.
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
async def caption_image(file):
    """Caption an uploaded image and synthesize speech for the caption.

    Args:
        file: Object exposing an awaitable ``read()`` that yields the raw
            image bytes (presumably a FastAPI ``UploadFile`` -- confirm
            against the caller).

    Returns:
        dict: Always contains ``"caption"``. Contains ``"audioUrl"``
        (``/files/<audio file name>``) only when speech synthesis produced
        a file.
    """
    contents = await file.read()

    # The captioner is handed a filesystem path, so spool the upload to disk.
    # The ".png" suffix is applied regardless of the real image format.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        tmp.write(contents)
        image_path = tmp.name

    # Run inference only after the handle is closed (so the path can be
    # reopened on every platform) and always delete the temporary image
    # afterwards; it was created with delete=False and would otherwise leak
    # one file per call.
    try:
        captions = captioner(image_path)
    finally:
        try:
            os.remove(image_path)
        except OSError:
            # Best-effort cleanup: a failed delete must not mask the
            # captioning result or the original exception.
            pass

    caption = captions[0]['generated_text'] if captions else "No caption generated."

    audio_path = text_to_speech(caption)
    result = {"caption": caption}
    if audio_path:
        # Only the file name is exposed; the "/files/" route that serves it
        # is defined outside this block.
        result["audioUrl"] = f"/files/{os.path.basename(audio_path)}"
    return result
def text_to_speech(text: str) -> str:
    """Synthesize ``text`` to an MP3 file with gTTS.

    Args:
        text: The text to speak.

    Returns:
        Path of the generated ``.mp3`` temporary file, or ``""`` if synthesis
        failed for any reason. The failure is logged, not raised, so callers
        can treat audio as optional.
    """
    audio_path = None
    try:
        tts = gTTS(text)
        # Reserve a unique path, then close the handle before gTTS writes to
        # it: keeping it open leaks a descriptor and prevents reopening the
        # file by name on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception:
        # Deliberately broad: speech is best-effort. Unlike a bare ``except:``
        # this lets KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).exception("Text-to-speech synthesis failed")
        if audio_path is not None:
            # Do not leave an empty or partial file behind on failure.
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return ""