Spaces:
Running
Running
File size: 5,225 Bytes
f23d324 0000b07 f23d324 ba47cad f23d324 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 ba47cad c255de1 b8dc4cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
"""from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from transformers import pipeline, ViltProcessor, ViltForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
from PIL import Image
import torch
import fitz # PyMuPDF for PDF
app = FastAPI()
# ========== Image QA Setup ==========
vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
def answer_question_from_image(image, question):
if image is None or not question.strip():
return "Please upload an image and ask a question."
inputs = vqa_processor(image, question, return_tensors="pt")
with torch.no_grad():
outputs = vqa_model(**inputs)
predicted_id = outputs.logits.argmax(-1).item()
return vqa_model.config.id2label[predicted_id]
# ========== Gradio Interfaces ==========
img_interface = gr.Interface(
fn=answer_question_from_image,
inputs=[gr.Image(label="Upload Image"), gr.Textbox(label="Ask a Question")],
outputs="text",
title="Image Question Answering"
)
# ========== Combine and Mount ==========
demo = gr.TabbedInterface( img_interface , "Image QA")
app = gr.mount_gradio_app(app, demo, path="/")
@app.get("/")
def root():
return RedirectResponse(url="/") """
"""from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
# Load image QA model once
vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
def answer_question_from_image(image, question):
if image is None or not question.strip():
return "Please upload an image and ask a question."
inputs = vqa_processor(image, question, return_tensors="pt")
with torch.no_grad():
outputs = vqa_model(**inputs)
predicted_id = outputs.logits.argmax(-1).item()
return vqa_model.config.id2label[predicted_id]"""
import os
import tempfile
from io import BytesIO

import easyocr
import numpy as np
import torch
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
from gtts import gTTS
from PIL import Image
from transformers import ViltForQuestionAnswering, ViltProcessor, pipeline
# ASGI application object; the route decorators further down register on it.
app = FastAPI()
# Load models
# Everything below is created once at import time and then reused by
# answer_question_from_image for every request.
# ViLT processor + model pair, both from the same VQA fine-tuned checkpoint.
vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
# "image-to-text" pipeline used for the caption mode.
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
# OCR reader configured with the 'en' and 'fr' language codes.
reader = easyocr.Reader(['en', 'fr'])
# Whole words that route a question to OCR.  Inflected forms are listed
# explicitly because matching is done on whole words, not substrings.
_OCR_KEYWORDS = frozenset(
    {"text", "texts", "say", "says", "written", "read", "reads", "reading"}
)
# Whole words that route a question to image captioning.
_CAPTION_KEYWORDS = frozenset(
    {"caption", "captions", "describe", "describes", "description"}
)
# Multi-word trigger for captioning, matched as a run of whole words.
_CAPTION_PHRASE = "what is in the image"


def classify_question(question: str) -> str:
    """Pick the processing mode for a user question.

    The question is lower-cased, punctuation is treated as whitespace and
    the remaining words are compared against small keyword sets.  Matching
    is done on whole words so that unrelated words which merely contain a
    keyword ("bread", "already", "essay", "context") no longer trigger OCR.

    Args:
        question: The free-text question asked about the image.

    Returns:
        ``"ocr"`` if an OCR keyword is present, otherwise ``"caption"`` if a
        caption keyword or the phrase "what is in the image" is present,
        otherwise ``"vqa"``.  OCR takes precedence over captioning.
    """
    # Turn every non-alphanumeric character into a space so punctuation
    # ("say?", "text:") does not stay glued to a keyword.
    normalised = "".join(
        ch if ch.isalnum() else " " for ch in question.lower()
    )
    tokens = normalised.split()
    words = set(tokens)

    if words & _OCR_KEYWORDS:
        return "ocr"

    # Pad with spaces so the phrase can only match on word boundaries.
    padded = f" {' '.join(tokens)} "
    if words & _CAPTION_KEYWORDS or f" {_CAPTION_PHRASE} " in padded:
        return "caption"

    return "vqa"
def answer_question_from_image(image, question):
    """Answer *question* about *image* and render the answer as speech.

    The question is routed by ``classify_question`` to one of three
    back-ends: OCR (``reader``), captioning (``captioner``) or visual
    question answering (``vqa_processor`` / ``vqa_model``).  A back-end
    failure does not raise; the error text becomes the answer.  The answer
    is then passed to gTTS and saved to a ``.mp3`` file in the temp
    directory.  That file is intentionally not deleted on success, so that
    it can be served later by path.

    Args:
        image: The image to analyse (presumably a PIL image; it is passed
            to ``np.array`` and to the model processors), or ``None``.
        question: The question text.  ``None``, empty or whitespace-only
            values are treated as missing input.

    Returns:
        A ``(answer, audio_path)`` tuple.  ``audio_path`` is ``None`` when
        input is missing or audio generation fails; in the latter case the
        answer text is prefixed with ``"Answer: "`` and carries the audio
        error message.
    """
    # ``not question`` guards against None before calling .strip().
    if image is None or not question or not question.strip():
        return "Please upload an image and ask a question.", None

    mode = classify_question(question)

    if mode == "ocr":
        try:
            detections = reader.readtext(np.array(image))
            # Element 1 of each detection is joined as the recognised text.
            text = " ".join(entry[1] for entry in detections)
            answer = text.strip() or "No readable text found."
        except Exception as e:
            answer = f"OCR Error: {e}"
    elif mode == "caption":
        try:
            answer = captioner(image)[0]['generated_text']
        except Exception as e:
            answer = f"Captioning error: {e}"
    else:
        try:
            inputs = vqa_processor(image, question, return_tensors="pt")
            with torch.no_grad():
                outputs = vqa_model(**inputs)
            # Highest-scoring logit index, mapped to its label string.
            predicted_id = outputs.logits.argmax(-1).item()
            answer = vqa_model.config.id2label[predicted_id]
        except Exception as e:
            answer = f"VQA error: {e}"

    audio_path = None
    try:
        tts = gTTS(text=answer)
        # Reserve a unique file name, then close the handle *before* gTTS
        # writes to the path; writing to a still-open temp file is not
        # portable across platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
    except Exception as e:
        # The temp file was created with delete=False; remove it so a
        # failed synthesis does not leave an orphaned file behind.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass  # best-effort cleanup; the audio error is what matters
        return f"Answer: {answer}\n\n⚠️ Audio generation error: {e}", None

    return answer, audio_path
@app.post("/predict")
async def predict(question: str = Form(...), file: UploadFile = File(...)):
    """Answer a question about an uploaded image.

    Expects a multipart form with a ``question`` text field and a ``file``
    upload.  The upload is decoded as an RGB image and handed to
    ``answer_question_from_image``.

    Returns:
        A JSON response containing ``answer`` and, when an audio file was
        produced and still exists on disk, ``audio`` with the
        ``/audio/<name>`` URL path for it.  On failure the body is
        ``{"error": <message>}`` with status 400 if the upload could not be
        read as an image, or 500 for any other error.
    """
    try:
        image_data = await file.read()
        image = Image.open(BytesIO(image_data)).convert("RGB")
    except Exception as e:
        # An unreadable or non-image upload is a client-side problem.
        return JSONResponse({"error": str(e)}, status_code=400)

    try:
        answer, audio_path = answer_question_from_image(image, question)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)

    payload = {"answer": answer}
    if audio_path and os.path.exists(audio_path):
        # Expose only the file name; the audio route resolves the directory.
        payload["audio"] = f"/audio/{os.path.basename(audio_path)}"
    return JSONResponse(payload)
@app.get("/audio/{filename}")
async def get_audio(filename: str):
    """Serve a generated ``.mp3`` file from the system temp directory.

    Args:
        filename: Bare file name of the audio file, as found in the
            ``/audio/<name>`` URL path.

    Returns:
        The file as an ``audio/mpeg`` response, or a JSON ``{"error": ...}``
        response with status 404 when the name is not a bare ``.mp3`` file
        name or no such file exists.
    """
    # The name comes straight from the URL, so accept only a plain ".mp3"
    # file name: no directory components, and nothing else that happens to
    # live in the shared temp directory.
    is_bare_name = filename == os.path.basename(filename)
    if not is_bare_name or not filename.endswith(".mp3"):
        return JSONResponse({"error": "Audio file not found."}, status_code=404)

    filepath = os.path.join(tempfile.gettempdir(), filename)
    # Answer with a 404 instead of failing while streaming a missing file.
    if not os.path.isfile(filepath):
        return JSONResponse({"error": "Audio file not found."}, status_code=404)

    return FileResponse(filepath, media_type="audio/mpeg")
@app.get("/")
def home():
    """Redirect requests for the site root to ``/templates/home.html``."""
    landing_page = "/templates/home.html"
    return RedirectResponse(url=landing_page)
|