Spaces:

Kilos1
/

Nutrition_App

Runtime error

App Files Community

Kilos1 committed on Mar 8

Commit

612c5f5

verified ·

1 Parent(s): ec744ec

Update multimodal_queries.py

Browse files

Files changed (1) hide show

multimodal_queries.py +52 -67

multimodal_queries.py CHANGED Viewed

@@ -1,87 +1,72 @@
 import re
 import base64
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
 from PIL import Image
-import io
-from transformers import Owlv2Processor, Owlv2ForObjectDetection
-processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-finetuned")
-model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-finetuned")
-def input_image_setup(uploaded_file):
-    """
-    Encodes the uploaded image file into a base64 string.
-    Parameters:
-    - uploaded_file: File-like object uploaded via Gradio.
-    Returns:
-    - encoded_image (str): Base64 encoded string of the image data.
-    """
-    if uploaded_file is not None:
-        bytes_data = uploaded_file.read()
-        encoded_image = base64.b64encode(bytes_data).decode("utf-8")
-        return encoded_image
-    else:
-        raise FileNotFoundError("No file uploaded")
-def generate_model_response(encoded_image, user_query, assistant_prompt="You are a helpful assistant. Answer the following user query in 1 or 2 sentences: "):
-    """
-    Sends an image and a query to the model and retrieves the description or answer.
-    Parameters:
-    - encoded_image (str): Base64-encoded image string.
-    - user_query (str): The user's question about the image.
-    - assistant_prompt (str): Optional prompt to guide the model's response.
-    Returns:
-    - str: The model's response for the given image and query.
     """
-    # Prepare input for the model
-    input_text = assistant_prompt + user_query + "\n![Image](data:image/jpeg;base64," + encoded_image + ")"
-    # Tokenize input text
-    inputs = tokenizer(input_text, return_tensors="pt")
-    # Generate response from the model
-    outputs = model.generate(**inputs)
-    # Decode and return the model's response
-    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return response_text
-def process_image_and_query(uploaded_file, user_query):
-    """
-    Process the uploaded image and user query to generate a response from the model.
     Parameters:
-    - uploaded_file: The uploaded image file.
     - user_query: The user's question about the image.
     Returns:
-    - str: The generated response from the model.
     """
-    # Encode the uploaded image
-    encoded_image = input_image_setup(uploaded_file)
-    # Generate response using the encoded image and user query
-    response = generate_model_response(encoded_image, user_query)
-    return response
-# Create Gradio interface
 iface = gr.Interface(
-    fn=process_image_and_query,
     inputs=[
-        gr.inputs.Image(type="file", label="Upload Image"),
-        gr.inputs.Textbox(label="User Query", placeholder="Enter your question about the image...")
     ],
-    outputs="text",
 )
-# Launch the Gradio app
-iface.launch()

 import re
 import base64
+import io
+import torch
 import gradio as gr
 from PIL import Image
+from transformers import MllamaForConditionalGeneration, AutoProcessor
+# Load the vision-language model and its processor once, at import time, so
+# that every call to generate_model_response reuses the same objects.
+# NOTE(review): this checkpoint is presumably gated on the Hub, so the Space
+# would need an access token with the licence accepted - confirm.
+model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+model = MllamaForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,  # weights are requested in bfloat16
+    device_map="auto",  # callers move their inputs to model.device afterwards
+)
+# The processor is loaded from the same checkpoint id as the model.
+processor = AutoProcessor.from_pretrained(model_id)
+def generate_model_response(image_file, user_query, max_new_tokens=256):
+    """
+    Answer a question about an uploaded image with the vision-language model.
+
+    Parameters:
+    - image_file: The uploaded image. May be a PIL image, a filesystem path,
+      or an open binary file object (anything ``Image.open`` accepts).
+    - user_query: The user's question about the image.
+    - max_new_tokens: Upper bound on the number of tokens generated for the
+      answer. Without an explicit bound, generation falls back to the
+      library/model default, which can cut answers short.
+
+    Returns:
+    - str: An HTML ``<p>`` fragment holding the escaped model answer, or an
+      escaped error message if loading the image or running the model fails.
+    """
+    # Local import so this function stays self-contained.
+    import html
+
+    # Give a clear message instead of a generic exception text when the
+    # user submits the form without an image.
+    if image_file is None:
+        return "<p>Please upload an image first.</p>"
+    try:
+        # Normalise whatever the UI hands over into an RGB PIL image.
+        if isinstance(image_file, Image.Image):
+            raw_image = image_file.convert("RGB")
+        else:
+            # convert() returns a fully loaded copy, so the underlying file
+            # can be closed as soon as the with-block ends.
+            with Image.open(image_file) as opened_image:
+                raw_image = opened_image.convert("RGB")
+        # The chat template inserts the image placeholder token itself; the
+        # content entry only has to declare that an image is present.
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": user_query or ""},
+                ],
+            }
+        ]
+        prompt = processor.apply_chat_template(
+            conversation, add_generation_prompt=True, tokenize=False
+        )
+        # Pass image and text by keyword so their order cannot be mixed up.
+        # The templated prompt already carries its special tokens, so the
+        # tokenizer must not add them a second time.
+        inputs = processor(
+            images=raw_image,
+            text=prompt,
+            add_special_tokens=False,
+            return_tensors="pt",
+        ).to(model.device)
+        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # generate() returns prompt + continuation; keep only the new tokens
+        # so the answer does not echo the user's question and role headers.
+        prompt_length = inputs["input_ids"].shape[-1]
+        generated_text = processor.decode(
+            outputs[0][prompt_length:], skip_special_tokens=True
+        )
+        # The result is rendered by an HTML component, so model output must
+        # be escaped before it is embedded in markup.
+        return f"<p>{html.escape(generated_text.strip())}</p>"
+    except Exception as e:
+        # UI boundary: report the failure to the user instead of crashing
+        # the app, and keep a trace of it in the server log.
+        print(f"Error in generating response: {e}")
+        return f"<p>An error occurred: {html.escape(str(e))}</p>"
+# Gradio interface: an image upload plus a free-text question go in, an HTML
+# fragment with the model's answer comes out.
 iface = gr.Interface(
+    fn=generate_model_response,
     inputs=[
+        # "file" is not an accepted value for gr.Image's `type`; "filepath"
+        # hands the callback a path string, which Image.open can read.
+        gr.Image(type="filepath", label="Upload Image"),
+        gr.Textbox(
+            label="Enter your question",
+            placeholder="How many calories are in this food?",
+        ),
     ],
+    outputs=gr.HTML(label="Response from Model"),
 )
+# NOTE(review): share=True requests a public share link; this is presumably
+# unnecessary when the app is hosted as a Space - confirm before removing.
+iface.launch(share=True)