Kilos1 committed
Commit 612c5f5 · verified · 1 parent: ec744ec

Update multimodal_queries.py

Files changed (1):
  1. multimodal_queries.py +52 -67
multimodal_queries.py CHANGED
@@ -1,87 +1,72 @@
 import re
 import base64
-from transformers import AutoModelForCausalLM, AutoTokenizer
+import io
+import torch
 import gradio as gr
 from PIL import Image
-import io
-from transformers import Owlv2Processor, Owlv2ForObjectDetection
-
-processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-finetuned")
-model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-finetuned")
-
-def input_image_setup(uploaded_file):
-    """
-    Encodes the uploaded image file into a base64 string.
-
-    Parameters:
-    - uploaded_file: File-like object uploaded via Gradio.
+from transformers import MllamaForConditionalGeneration, AutoProcessor

-    Returns:
-    - encoded_image (str): Base64 encoded string of the image data.
-    """
-    if uploaded_file is not None:
-        bytes_data = uploaded_file.read()
-        encoded_image = base64.b64encode(bytes_data).decode("utf-8")
-        return encoded_image
-    else:
-        raise FileNotFoundError("No file uploaded")
-
-def generate_model_response(encoded_image, user_query, assistant_prompt="You are a helpful assistant. Answer the following user query in 1 or 2 sentences: "):
-    """
-    Sends an image and a query to the model and retrieves the description or answer.
-
-    Parameters:
-    - encoded_image (str): Base64-encoded image string.
-    - user_query (str): The user's question about the image.
-    - assistant_prompt (str): Optional prompt to guide the model's response.
+# Load the model and processor
+model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+model = MllamaForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+processor = AutoProcessor.from_pretrained(model_id)

-    Returns:
-    - str: The model's response for the given image and query.
+def generate_model_response(image_file, user_query):
     """
+    Processes the uploaded image and user query to generate a response from the model.

-    # Prepare input for the model
-    input_text = assistant_prompt + user_query + "\n![Image](data:image/jpeg;base64," + encoded_image + ")"
-
-    # Tokenize input text
-    inputs = tokenizer(input_text, return_tensors="pt")
-
-    # Generate response from the model
-    outputs = model.generate(**inputs)
-
-    # Decode and return the model's response
-    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    return response_text
-
-def process_image_and_query(uploaded_file, user_query):
-    """
-    Process the uploaded image and user query to generate a response from the model.
-
     Parameters:
-    - uploaded_file: The uploaded image file.
+    - image_file: The uploaded image file.
     - user_query: The user's question about the image.

     Returns:
-    - str: The generated response from the model.
+    - str: The generated response from the model, formatted as HTML.
     """
+    try:
+        # Load and prepare the image
+        raw_image = Image.open(image_file).convert("RGB")
+
+        # Prepare the conversation for the processor's chat template
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},  # the chat template inserts the image token here
+                    {"type": "text", "text": user_query}
+                ]
+            }
+        ]
+
+        # Apply the chat template to build the generation prompt
+        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+
+        # Process the image and text inputs together (images first, then text)
+        inputs = processor(raw_image, prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
+
+        # Generate a response from the model
+        outputs = model.generate(**inputs, max_new_tokens=256)
+
+        # Decode only the newly generated tokens, skipping the prompt
+        generated_text = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+
+        return generated_text

-    # Encode the uploaded image
-    encoded_image = input_image_setup(uploaded_file)
-
-    # Generate response using the encoded image and user query
-    response = generate_model_response(encoded_image, user_query)
-
-    return response
+    except Exception as e:
+        print(f"Error in generating response: {e}")
+        return f"<p>An error occurred: {str(e)}</p>"

-# Create Gradio interface
+# Gradio Interface
 iface = gr.Interface(
-    fn=process_image_and_query,
+    fn=generate_model_response,
     inputs=[
-        gr.inputs.Image(type="file", label="Upload Image"),
-        gr.inputs.Textbox(label="User Query", placeholder="Enter your question about the image...")
+        gr.Image(type="filepath", label="Upload Image"),
+        gr.Textbox(label="Enter your question", placeholder="How many calories are in this food?")
     ],
-    outputs="text",
+    outputs=gr.HTML(label="Response from Model"),
 )

-# Launch the Gradio app
-iface.launch()
+iface.launch(share=True)
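
For a quick sanity check outside the Gradio UI, the updated generate_model_response can also be called directly. A minimal sketch, assuming the model weights above are available; the image path sample.jpg is a hypothetical placeholder, not part of the commit:

# Hypothetical smoke test; "sample.jpg" is a placeholder path.
# gr.Image(type="filepath") hands the function a plain file path, so we do the same here.
if __name__ == "__main__":
    answer = generate_model_response("sample.jpg", "What is shown in this image?")
    print(answer)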