"""from fastapi import FastAPI from fastapi.responses import RedirectResponse import gradio as gr from transformers import pipeline, ViltProcessor, ViltForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM from PIL import Image import torch import fitz # PyMuPDF for PDF app = FastAPI() # ========== Image QA Setup ========== vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") def answer_question_from_image(image, question): if image is None or not question.strip(): return "Please upload an image and ask a question." inputs = vqa_processor(image, question, return_tensors="pt") with torch.no_grad(): outputs = vqa_model(**inputs) predicted_id = outputs.logits.argmax(-1).item() return vqa_model.config.id2label[predicted_id] # ========== Gradio Interfaces ========== img_interface = gr.Interface( fn=answer_question_from_image, inputs=[gr.Image(label="Upload Image"), gr.Textbox(label="Ask a Question")], outputs="text", title="Image Question Answering" ) # ========== Combine and Mount ========== demo = gr.TabbedInterface( img_interface , "Image QA") app = gr.mount_gradio_app(app, demo, path="/") @app.get("/") def root(): return RedirectResponse(url="/") """ from transformers import ViltProcessor, ViltForQuestionAnswering import torch # Load image QA model once vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") def answer_question_from_image(image, question): if image is None or not question.strip(): return "Please upload an image and ask a question." inputs = vqa_processor(image, question, return_tensors="pt") with torch.no_grad(): outputs = vqa_model(**inputs) predicted_id = outputs.logits.argmax(-1).item() return vqa_model.config.id2label[predicted_id]