Narayana02 committed on
Commit 7ec2b4b · verified · 1 Parent(s): 2cec5fb

Update app.py

Files changed (1)
  1. app.py +37 -57
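
The updated app.py (diff below) imports its access token from a separate config module instead of embedding it in the app. A minimal config.py sketch — assuming the token is supplied through an environment variable; the variable name is illustrative, not part of the commit — might be:

import os

# Keep the Hugging Face access token out of the repository by reading it from the environment.
HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]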
app.py CHANGED
@@ -1,38 +1,20 @@
-import os
 import streamlit as st
-import onnxruntime as ort
-from transformers import AutoTokenizer, AutoProcessor
+from huggingface_hub import InferenceClient
+from config import HUGGINGFACE_API_KEY  # Import your API key from a separate config file
 from PIL import Image
 from io import BytesIO
 
-# Download ONNX models if they do not already exist
-if not os.path.exists("vision_encoder_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
-if not os.path.exists("decoder_model_merged_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
-if not os.path.exists("embed_tokens_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
-
-# Load tokenizer and processor
-tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
-processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
-
-# Load ONNX sessions
-vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
-decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
-embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")
-
 # Streamlit App Configuration
-st.set_page_config(page_title="Vision-Based ONNX AI App", page_icon="🤖", layout="wide")
-st.title("🖼️ Vision-Based ONNX AI Demo App")
-st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and get a description</p>", unsafe_allow_html=True)
+st.set_page_config(page_title="Llama-3.2 Demo App", page_icon="🤖", layout="wide")
+st.title("🖼️ Llama-3.2-90B-Vision-Instruct Demo App")
+st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and receive a text description of its content</p>", unsafe_allow_html=True)
 
-# User Input: Image Upload
+# User Inputs
 uploaded_image = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])
-user_prompt = st.text_input("Enter your prompt", value="Describe this image in detail", placeholder="e.g., What is shown in the image?")
+user_prompt = st.text_input("Enter your prompt", value="Describe this image in a paragraph", placeholder="e.g., What is shown in the image?")
 
-# Display uploaded image
-def display_uploaded_image(uploaded_image):
+# Function to display the uploaded image
+def show_uploaded_image(uploaded_image):
     try:
         img = Image.open(uploaded_image)
         st.image(img, caption="Uploaded Image", use_container_width=True)
@@ -41,57 +23,55 @@ def display_uploaded_image(uploaded_image):
         st.error(f"❌ Unable to display image. Error: {e}")
         return None
 
-# Process the uploaded image
-if st.button("Get Description"):
+# Process user input
+if st.button("Get Description", key="get_description"):
     if uploaded_image and user_prompt:
         try:
             # Display the uploaded image
-            img = display_uploaded_image(uploaded_image)
+            img = show_uploaded_image(uploaded_image)
             if img is None:
                 st.error("❌ Image processing failed.")
                 st.stop()
 
-            # Preprocess the image
+            # Convert the image to bytes for model input
             img_buffer = BytesIO()
             img.save(img_buffer, format="PNG")
             img_bytes = img_buffer.getvalue()
-            processed_image = processor(images=img, return_tensors="np")
-
-            # Generate embeddings using the vision encoder
-            vision_embeddings = vision_encoder_session.run(
-                None, {"pixel_values": processed_image["pixel_values"]}
-            )[0]
 
-            # Tokenize the user prompt
-            inputs = tokenizer(user_prompt, return_tensors="np")
-            input_ids = inputs["input_ids"]
-
-            # Generate embedded tokens
-            embedded_tokens = embed_tokens_session.run(
-                None, {"input_ids": input_ids}
-            )[0]
-
-            # Generate a response using the decoder
-            decoder_outputs = decoder_session.run(
-                None, {
-                    "vision_embeddings": vision_embeddings,
-                    "embedded_tokens": embedded_tokens
+            # Initialize the InferenceClient
+            client = InferenceClient(api_key=HUGGINGFACE_API_KEY)
+
+            # Define messages for the model
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": user_prompt},
+                        {"type": "image", "image": {"bytes": img_bytes}}
+                    ]
                 }
-            )[0]
+            ]
+
+            # Call the model
+            completion = client.chat.completions.create(
+                model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+                messages=messages,
+                max_tokens=500
+            )
 
-            # Decode the output
-            description = tokenizer.decode(decoder_outputs, skip_special_tokens=True)
+            # Extract JSON response
+            model_response = completion.choices[0].message
 
-            # Display the description
+            # Display the result
             st.subheader("📝 Model Response")
-            st.markdown(f"**Description**: {description}")
+            st.markdown(f"**Description**: {model_response.get('content', 'No description available')}")
 
         except Exception as e:
             st.error(f"❌ An error occurred: {e}")
     else:
         st.warning("⚠️ Please upload an image and enter a prompt.")
 
-# UI Enhancements
+# Clean UI Enhancements
 st.markdown("""
     <style>
     .stButton>button {
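
For reference, a minimal standalone sketch of the new inference path. It assumes a valid Hugging Face access token and passes the image as a base64 data URL with the OpenAI-style "image_url" content type, since the raw-bytes {"type": "image"} payload in the committed code may not be accepted by the chat-completion endpoint; the helper name describe_image and the placeholder token are illustrative, not part of the commit.

import base64
from huggingface_hub import InferenceClient

HUGGINGFACE_API_KEY = "hf_..."  # placeholder; the app imports this from config.py

def describe_image(image_bytes: bytes, prompt: str) -> str:
    """Send a PNG image plus a text prompt to the hosted vision model and return its reply."""
    client = InferenceClient(api_key=HUGGINGFACE_API_KEY)

    # Embed the image in the JSON payload as a base64 data URL.
    data_url = "data:image/png;base64," + base64.b64encode(image_bytes).decode("utf-8")

    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        max_tokens=500,
    )

    # choices[0].message is an object, so read its .content attribute rather than calling .get().
    return completion.choices[0].message.content

In the Streamlit app, the img_bytes produced in the button handler would be passed to this helper and the returned string shown with st.markdown.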