import streamlit as st
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image
import torch

# Hugging Face checkpoint shared by the processor and the model.
VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"


@st.cache_resource
def load_vqa_components():
    """Load the ViLT processor and VQA model from ``VQA_CHECKPOINT``.

    Streamlit re-executes this script on every widget interaction, so an
    undecorated module-level ``from_pretrained`` call would reload the
    checkpoint on each rerun. ``st.cache_resource`` keeps the returned
    objects alive and hands back the same instances on later reruns.

    Returns:
        A ``(processor, model)`` tuple.
    """
    vqa_processor = ViltProcessor.from_pretrained(VQA_CHECKPOINT)
    vqa_model = ViltForQuestionAnswering.from_pretrained(VQA_CHECKPOINT)
    return vqa_processor, vqa_model


# Keep the original module-level names so the rest of the script is unchanged.
processor, model = load_vqa_components()

# Streamlit app UI
st.title("Visual Question Answering (VQA) with VILT")

# Image uploader
uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

# Question input
question = st.text_input("Enter your question about the image:")

# A button to trigger the VQA task. The branch below only runs on the
# script rerun caused by the click.
if st.button("Get Answer"):
    # Treat whitespace-only input as empty so the model never receives a
    # blank question.
    cleaned_question = question.strip()

    if uploaded_image is None:
        st.error("Please upload an image.")
    elif not cleaned_question:
        st.error("Please enter a question.")
    else:
        try:
            # Load the image from the uploader
            uploaded = Image.open(uploaded_image)

            # Show the uploaded image in the app
            # NOTE(review): newer Streamlit releases deprecate
            # `use_column_width` in favour of `use_container_width` --
            # confirm against the deployed Streamlit version before switching.
            st.image(uploaded, caption="Uploaded Image", use_column_width=True)

            # The uploader accepts PNG, which may be RGBA, palette-based or
            # grayscale; normalise to 3-channel RGB before handing the image
            # to the processor.
            image = uploaded.convert("RGB")

            # Process the image and question
            encoding = processor(image, cleaned_question, return_tensors="pt")

            # Forward pass through the model. Inference only, so skip
            # autograd bookkeeping.
            with torch.no_grad():
                outputs = model(**encoding)
            idx = outputs.logits.argmax(-1).item()

            # Map the highest-scoring class index to its answer string
            answer = model.config.id2label[idx]

            # Show the answer
            st.success(f"Predicted Answer: {answer}")

        except Exception as e:
            # Top-level UI boundary: report the failure in the page instead
            # of letting the script crash with a traceback.
            st.error(f"Error: {str(e)}")