"""FastAPI app with helpers that answer questions about an image.

A question is routed to OCR (easyocr), image captioning or visual question
answering (ViLT); the textual answer is additionally rendered to an MP3 file
with gTTS.
"""

import os
import tempfile
from io import BytesIO

from fastapi import FastAPI
from fastapi.responses import RedirectResponse, JSONResponse, FileResponse
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline
from gtts import gTTS
import easyocr
import torch
import numpy as np

app = FastAPI()

# Models are instantiated once at import time and shared by every call below.
vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
reader = easyocr.Reader(['en', 'fr'])

# Substrings (matched against the lower-cased question) that select a mode.
_OCR_KEYWORDS = ("text", "say", "written", "read")
_CAPTION_KEYWORDS = ("caption", "describe", "what is in the image")


def classify_question(question: str) -> str:
    """Return the processing mode for *question*: "ocr", "caption" or "vqa".

    Matching is a case-insensitive substring test; OCR keywords take
    precedence over caption keywords, and "vqa" is the fallback.
    """
    lowered = question.lower()
    if any(keyword in lowered for keyword in _OCR_KEYWORDS):
        return "ocr"
    if any(keyword in lowered for keyword in _CAPTION_KEYWORDS):
        return "caption"
    return "vqa"


def _synthesize_speech(text):
    """Render *text* to a temporary MP3 file with gTTS and return its path.

    The file is created with ``delete=False``; the caller owns it. If
    synthesis fails the partially created file is removed before the
    exception propagates.
    """
    # Build the gTTS object first so an invalid text fails before any file
    # is created on disk.
    tts = gTTS(text=text)

    # Only reserve a unique path here. The handle is closed before gTTS
    # writes to it, because reopening a still-open NamedTemporaryFile by name
    # is not possible on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name

    try:
        tts.save(audio_path)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        try:
            os.remove(audio_path)
        except OSError:
            pass
        raise
    return audio_path


def answer_question_from_image(image, question):
    """Answer *question* about *image* and synthesize the answer as speech.

    Args:
        image: The image to analyse. It is passed to ``np.array`` for OCR,
            to the captioning pipeline, or to the ViLT processor depending
            on the question (presumably a PIL image; confirm with callers).
        question: The user's question as text.

    Returns:
        A ``(answer, audio_path)`` tuple. ``audio_path`` is the path of a
        temporary MP3 file containing the spoken answer, or ``None`` when
        the input was missing or processing failed. On failure ``answer``
        is an ``"Error: ..."`` message rather than an exception.
    """
    # Guard against a missing image and a missing/blank question alike.
    if image is None or not question or not question.strip():
        return "Please upload an image and ask a question.", None

    mode = classify_question(question)
    try:
        if mode == "ocr":
            detections = reader.readtext(np.array(image))
            # The recognised text is taken from index 1 of each result entry.
            answer = " ".join([entry[1] for entry in detections]) or "No readable text found."
        elif mode == "caption":
            answer = captioner(image)[0]['generated_text']
        else:
            inputs = vqa_processor(image, question, return_tensors="pt")
            with torch.no_grad():
                outputs = vqa_model(**inputs)
            predicted_id = outputs.logits.argmax(-1).item()
            answer = vqa_model.config.id2label[predicted_id]

        return answer, _synthesize_speech(answer)
    except Exception as e:
        # Boundary handler: callers expect an error string, not an exception.
        return f"Error: {e}", None


@app.get("/")
def home():
    """Redirect the root URL to the home page template."""
    return RedirectResponse(url="/templates/home.html")