Spaces:
Running
Running
# appImage.py
from transformers import pipeline | |
import tempfile, os | |
from PIL import Image | |
from gtts import gTTS | |
# Image-captioning pipeline, constructed once when the module is imported and
# reused by caption_image for every call.
captioner = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
)
async def caption_image(file):
    """Caption an uploaded image and, when possible, attach a spoken version.

    Args:
        file: Upload object exposing an awaitable ``read()`` that returns the
            raw image bytes (presumably a FastAPI/Starlette ``UploadFile`` --
            confirm against the caller).

    Returns:
        A dict with ``"caption"`` (the generated text, or
        ``"No caption generated."`` when the model returns nothing) and, only
        if speech synthesis succeeded, ``"audioUrl"`` of the form
        ``/files/<mp3 file name>``.
    """
    contents = await file.read()

    # The captioner is given a path, so the bytes are staged in a temp file.
    # delete=False keeps the file on disk after the handle is closed, which
    # guarantees the data is flushed before the pipeline opens it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        tmp.write(contents)
        image_path = tmp.name

    try:
        captions = captioner(image_path)
    finally:
        # The image is only needed for inference; remove it so uploads do not
        # pile up in the temp directory (also when the captioner raises).
        try:
            os.remove(image_path)
        except OSError:
            pass

    caption = captions[0]["generated_text"] if captions else "No caption generated."

    # text_to_speech returns an empty string on failure, in which case the
    # response simply carries no audio link.
    audio_path = text_to_speech(caption)
    result = {"caption": caption}
    if audio_path:
        # Only the file name is exposed; the "/files/" route that serves it is
        # presumably defined elsewhere in the app -- confirm.
        result["audioUrl"] = f"/files/{os.path.basename(audio_path)}"
    return result
def text_to_speech(text: str) -> str:
    """Synthesize *text* to an MP3 file with gTTS.

    This is best-effort: any failure is logged and reported through the
    return value instead of an exception, so callers can skip the audio.

    Args:
        text: The text to speak.

    Returns:
        Path of the generated ``.mp3`` in the system temp directory, or an
        empty string if synthesis failed. The caller owns the returned file.
    """
    import logging

    audio_path = ""
    try:
        tts = gTTS(text)
        # Reserve a unique file name, then close the handle before gTTS writes
        # to it: avoids leaking a descriptor and writing to a path that is
        # still held open (which fails on some platforms).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still
        # propagate; the failure is recorded instead of silently dropped.
        logging.getLogger(__name__).exception("Text-to-speech conversion failed")
        if audio_path:
            # Do not leave an empty or partial MP3 behind.
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return ""