"""from fastapi import FastAPI, UploadFile, File
from fastapi.responses import RedirectResponse, JSONResponse
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import tempfile
import torch

app = FastAPI()

# Load model
try:
    processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
    model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
    USE_GIT = True
except Exception:
    from transformers import pipeline
    captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    USE_GIT = False

def generate_caption(image_path):
    try:
        if USE_GIT:
            image = Image.open(image_path)
            inputs = processor(images=image, return_tensors="pt")
            outputs = model.generate(**inputs, max_length=50)
            return processor.batch_decode(outputs, skip_special_tokens=True)[0]
        else:
            result = captioner(image_path)
            return result[0]['generated_text']
    except Exception as e:
        return f"Error generating caption: {str(e)}"

@app.post("/imagecaption/")
async def caption_from_frontend(file: UploadFile = File(...)):
    contents = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        tmp.write(contents)
        image_path = tmp.name

    caption = generate_caption(image_path)
    return JSONResponse({"caption": caption})

@app.get("/")
def home():
    return RedirectResponse(url="/")"""
# appImage.py
from transformers import pipeline, AutoProcessor, AutoModelForCausalLM
import os
import tempfile
from PIL import Image
from gtts import gTTS

# Prefer the GIT captioning model; fall back to the ViT-GPT2 pipeline if it fails to load.
try:
    processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
    model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
    USE_GIT = True
except Exception:
    captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    USE_GIT = False

async def caption_image(file):
    """Caption an uploaded image and attach a spoken (MP3) rendering of the caption."""
    contents = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        tmp.write(contents)
        image_path = tmp.name

    try:
        if USE_GIT:
            image = Image.open(image_path).convert("RGB")
            pixel_values = processor(images=image, return_tensors="pt").pixel_values
            # max_length is only an upper bound; generation stops at the EOS token.
            generated_ids = model.generate(pixel_values, max_length=50)
            caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        else:
            captions = captioner(image_path)
            caption = captions[0]["generated_text"] if captions else "No caption generated."
    finally:
        os.remove(image_path)  # clean up the temporary image

    audio_path = text_to_speech(caption)

    result = {"caption": caption}
    if audio_path:
        result["audio"] = f"/files/{os.path.basename(audio_path)}"
    return result

def text_to_speech(text: str) -> str:
    """Render text as an MP3 via gTTS; return "" if synthesis fails (e.g. no network access)."""
    try:
        tts = gTTS(text)
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_audio.close()  # release the handle so gTTS can write to the path (matters on Windows)
        tts.save(temp_audio.name)
        return temp_audio.name
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        return ""