File size: 4,060 Bytes
f23d324
0000b07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f23d324
ba47cad
f23d324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba47cad
7abb419
 
c255de1
ba47cad
 
 
 
 
 
 
 
c255de1
ba47cad
 
 
 
 
 
c255de1
ba47cad
 
7abb419
 
ba47cad
7abb419
ba47cad
7abb419
ba47cad
 
 
 
 
 
 
7abb419
 
ba47cad
7abb419
c255de1
7abb419
ba47cad
c255de1
7abb419
ba47cad
 
 
 
 
 
 
 
 
7abb419
c255de1
ba47cad
7abb419
c255de1
 
 
b8dc4cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from transformers import pipeline, ViltProcessor, ViltForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
from PIL import Image
import torch
import fitz  # PyMuPDF for PDF
app = FastAPI()

# ========== Image QA Setup ==========
vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
def answer_question_from_image(image, question):
    if image is None or not question.strip():
        return "Please upload an image and ask a question."
    inputs = vqa_processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = vqa_model(**inputs)
    predicted_id = outputs.logits.argmax(-1).item()
    return vqa_model.config.id2label[predicted_id]
# ========== Gradio Interfaces ==========

img_interface = gr.Interface(
    fn=answer_question_from_image,
    inputs=[gr.Image(label="Upload Image"), gr.Textbox(label="Ask a Question")],
    outputs="text",
    title="Image Question Answering"
)
# ========== Combine and Mount ==========
demo = gr.TabbedInterface( img_interface , "Image QA")
app = gr.mount_gradio_app(app, demo, path="/")
@app.get("/")
def root():
    return RedirectResponse(url="/") """
"""from transformers import ViltProcessor, ViltForQuestionAnswering
import torch

# Load image QA model once
vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

def answer_question_from_image(image, question):
    if image is None or not question.strip():
        return "Please upload an image and ask a question."
    
    inputs = vqa_processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = vqa_model(**inputs)
    
    predicted_id = outputs.logits.argmax(-1).item()
    return vqa_model.config.id2label[predicted_id]"""
### ✅ appImage.py — Image QA Backend (Cleaned)
import logging
import os
import re
import tempfile
from io import BytesIO

import easyocr
import numpy as np
import torch
from fastapi import FastAPI
from fastapi.responses import RedirectResponse, JSONResponse, FileResponse
from gtts import gTTS
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline

# ---------- Application and model setup ----------
# Everything here is created once, when the module is imported, and is then
# reused by the functions defined further down.
app = FastAPI()

# ViLT processor/model pair for visual question answering; both halves are
# loaded from the same checkpoint.
_VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"
vqa_processor = ViltProcessor.from_pretrained(_VQA_CHECKPOINT)
vqa_model = ViltForQuestionAnswering.from_pretrained(_VQA_CHECKPOINT)

# Image-to-text pipeline used for captioning.
_CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
captioner = pipeline("image-to-text", model=_CAPTION_CHECKPOINT)

# EasyOCR reader configured for English and French.
_OCR_LANGUAGES = ['en', 'fr']
reader = easyocr.Reader(_OCR_LANGUAGES)

# OCR trigger words, matched as whole words only (plus simple inflections such
# as "says" / "reading" / "texts"). Plain substring tests would also fire on
# unrelated words like "bread", "essay", "texture" or "context".
_OCR_WORD_RE = re.compile(
    r"\b(?:texts?|say(?:s|ing)?|written|read(?:s|ing)?)\b"
)

# Phrases that request a free-form description of the picture.
_CAPTION_PHRASES = ("caption", "describe", "what is in the image")


def classify_question(question: str) -> str:
    """Pick the processing mode for a user question about an image.

    Args:
        question: The question text. ``None`` or an empty string is treated
            as a question without any trigger word.

    Returns:
        ``"ocr"`` when the question contains an OCR trigger word,
        ``"caption"`` when it contains a caption phrase, otherwise ``"vqa"``.
        OCR is checked first, so it wins when both kinds of trigger appear.
    """
    q = (question or "").lower()
    if _OCR_WORD_RE.search(q):
        return "ocr"
    if any(phrase in q for phrase in _CAPTION_PHRASES):
        return "caption"
    return "vqa"

def _synthesize_speech(text):
    """Render *text* to an MP3 file with gTTS and return the file's path.

    Returns ``None`` when synthesis fails, so that a speech failure never
    hides a text answer that has already been computed. The file is created
    with ``delete=False``; whoever receives the path is responsible for it.
    """
    audio_path = None
    try:
        tts = gTTS(text=text)
        # Only reserve a unique path here and close the handle straight away:
        # a NamedTemporaryFile that is still open cannot be opened a second
        # time by name on Windows, and save() is given the name, not the handle.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
    except Exception:
        # Audio is optional: record the failure and carry on without it.
        logging.getLogger(__name__).exception("Text-to-speech synthesis failed")
        if audio_path is not None:
            try:
                os.remove(audio_path)  # do not leak the delete=False file
            except OSError:
                pass  # best-effort cleanup only
        return None
    return audio_path


def answer_question_from_image(image, question):
    """Answer a question about an image and voice the answer.

    The question is routed by ``classify_question`` to OCR, captioning or
    visual question answering, and the resulting text is then passed to gTTS.

    Args:
        image: The image to inspect. It is handed to ``np.array``, the
            captioning pipeline and the ViLT processor — presumably a PIL
            image; confirm against the caller.
        question: The user's question. ``None``, empty and whitespace-only
            values are all treated as "no question".

    Returns:
        A ``(text, audio_path)`` tuple. ``audio_path`` is the path of an MP3
        file, or ``None`` when the input was incomplete, inference failed, or
        speech synthesis failed. On an inference failure ``text`` is
        ``"Error: <message>"``.
    """
    if image is None or not question or not question.strip():
        return "Please upload an image and ask a question.", None

    mode = classify_question(question)

    try:
        if mode == "ocr":
            result = reader.readtext(np.array(image))
            # entry[1] is the field the original code reads as the text.
            answer = " ".join(entry[1] for entry in result) or "No readable text found."

        elif mode == "caption":
            answer = captioner(image)[0]['generated_text']

        else:
            inputs = vqa_processor(image, question, return_tensors="pt")
            with torch.no_grad():
                outputs = vqa_model(**inputs)
            predicted_id = outputs.logits.argmax(-1).item()
            answer = vqa_model.config.id2label[predicted_id]

    except Exception as e:
        # Request boundary: report the failure as text instead of raising.
        return f"Error: {e}", None

    return answer, _synthesize_speech(answer)

@app.get("/")
def home():
    """Redirect requests for the site root to the home page template."""
    landing_page = "/templates/home.html"
    return RedirectResponse(url=landing_page)