Update app.py
Browse files
app.py
CHANGED
@@ -1,38 +1,20 @@
|
|
1 |
-
import os
|
2 |
import streamlit as st
|
3 |
-
|
4 |
-
from
|
5 |
from PIL import Image
|
6 |
from io import BytesIO
|
7 |
|
8 |
-
# Download ONNX models if they do not already exist
|
9 |
-
if not os.path.exists("vision_encoder_q4f16.onnx"):
|
10 |
-
os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
|
11 |
-
if not os.path.exists("decoder_model_merged_q4f16.onnx"):
|
12 |
-
os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
|
13 |
-
if not os.path.exists("embed_tokens_q4f16.onnx"):
|
14 |
-
os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
|
15 |
-
|
16 |
-
# Load tokenizer and processor
|
17 |
-
tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
|
18 |
-
processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
|
19 |
-
|
20 |
-
# Load ONNX sessions
|
21 |
-
vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
|
22 |
-
decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
|
23 |
-
embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")
|
24 |
-
|
25 |
# Streamlit App Configuration
|
26 |
-
st.set_page_config(page_title="
|
27 |
-
st.title("πΌοΈ Vision-
|
28 |
-
st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and
|
29 |
|
30 |
-
# User
|
31 |
uploaded_image = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])
|
32 |
-
user_prompt = st.text_input("Enter your prompt", value="Describe this image in
|
33 |
|
34 |
-
#
|
35 |
-
def
|
36 |
try:
|
37 |
img = Image.open(uploaded_image)
|
38 |
st.image(img, caption="Uploaded Image", use_container_width=True)
|
@@ -41,57 +23,55 @@ def display_uploaded_image(uploaded_image):
|
|
41 |
st.error(f"❌ Unable to display image. Error: {e}")
|
42 |
return None
|
43 |
|
44 |
-
# Process
|
45 |
-
if st.button("Get Description"):
|
46 |
if uploaded_image and user_prompt:
|
47 |
try:
|
48 |
# Display the uploaded image
|
49 |
-
img =
|
50 |
if img is None:
|
51 |
st.error("❌ Image processing failed.")
|
52 |
st.stop()
|
53 |
|
54 |
-
#
|
55 |
img_buffer = BytesIO()
|
56 |
img.save(img_buffer, format="PNG")
|
57 |
img_bytes = img_buffer.getvalue()
|
58 |
-
processed_image = processor(images=img, return_tensors="np")
|
59 |
-
|
60 |
-
# Generate embeddings using the vision encoder
|
61 |
-
vision_embeddings = vision_encoder_session.run(
|
62 |
-
None, {"pixel_values": processed_image["pixel_values"]}
|
63 |
-
)[0]
|
64 |
|
65 |
-
#
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
None, {
|
77 |
-
"vision_embeddings": vision_embeddings,
|
78 |
-
"embedded_tokens": embedded_tokens
|
79 |
}
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
-
#
|
83 |
-
|
84 |
|
85 |
-
# Display the
|
86 |
st.subheader("π Model Response")
|
87 |
-
st.markdown(f"**Description**: {description}")
|
88 |
|
89 |
except Exception as e:
|
90 |
st.error(f"❌ An error occurred: {e}")
|
91 |
else:
|
92 |
st.warning("⚠️ Please upload an image and enter a prompt.")
|
93 |
|
94 |
-
# UI Enhancements
|
95 |
st.markdown("""
|
96 |
<style>
|
97 |
.stButton>button {
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from huggingface_hub import InferenceClient
|
3 |
+
from config import HUGGINGFACE_API_KEY # Import your API key from a separate config file
|
4 |
from PIL import Image
|
5 |
from io import BytesIO
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
# Streamlit App Configuration
|
8 |
+
st.set_page_config(page_title="Llama-3.2 Demo App", page_icon="🤖", layout="wide")
|
9 |
+
st.title("🖼️ Llama-3.2-90B-Vision-Instruct Demo App")
|
10 |
+
st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and receive a text description of its content</p>", unsafe_allow_html=True)
|
11 |
|
12 |
+
# User Inputs
|
13 |
uploaded_image = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])
|
14 |
+
user_prompt = st.text_input("Enter your prompt", value="Describe this image in a paragraph", placeholder="e.g., What is shown in the image?")
|
15 |
|
16 |
+
# Function to display the uploaded image
|
17 |
+
def show_uploaded_image(uploaded_image):
|
18 |
try:
|
19 |
img = Image.open(uploaded_image)
|
20 |
st.image(img, caption="Uploaded Image", use_container_width=True)
|
|
|
23 |
st.error(f"❌ Unable to display image. Error: {e}")
|
24 |
return None
|
25 |
|
26 |
+
# Process user input
|
27 |
+
if st.button("Get Description", key="get_description"):
|
28 |
if uploaded_image and user_prompt:
|
29 |
try:
|
30 |
# Display the uploaded image
|
31 |
+
img = show_uploaded_image(uploaded_image)
|
32 |
if img is None:
|
33 |
st.error("❌ Image processing failed.")
|
34 |
st.stop()
|
35 |
|
36 |
+
# Convert the image to bytes for model input
|
37 |
img_buffer = BytesIO()
|
38 |
img.save(img_buffer, format="PNG")
|
39 |
img_bytes = img_buffer.getvalue()
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
+
# Initialize the InferenceClient
|
42 |
+
client = InferenceClient(api_key=HUGGINGFACE_API_KEY)
|
43 |
+
|
44 |
+
# Define messages for the model
|
45 |
+
messages = [
|
46 |
+
{
|
47 |
+
"role": "user",
|
48 |
+
"content": [
|
49 |
+
{"type": "text", "text": user_prompt},
|
50 |
+
{"type": "image", "image": {"bytes": img_bytes}}
|
51 |
+
]
|
|
|
|
|
|
|
52 |
}
|
53 |
+
]
|
54 |
+
|
55 |
+
# Call the model
|
56 |
+
completion = client.chat.completions.create(
|
57 |
+
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
58 |
+
messages=messages,
|
59 |
+
max_tokens=500
|
60 |
+
)
|
61 |
|
62 |
+
# Extract the assistant message from the first completion choice
|
63 |
+
model_response = completion.choices[0].message
|
64 |
|
65 |
+
# Display the result
|
66 |
st.subheader("π Model Response")
|
67 |
+
st.markdown(f"**Description**: {model_response.get('content', 'No description available')}")
|
68 |
|
69 |
except Exception as e:
|
70 |
st.error(f"❌ An error occurred: {e}")
|
71 |
else:
|
72 |
st.warning("⚠️ Please upload an image and enter a prompt.")
|
73 |
|
74 |
+
# Clean UI Enhancements
|
75 |
st.markdown("""
|
76 |
<style>
|
77 |
.stButton>button {
|