Narayana02 committed on
Commit 7ec2b4b · verified · 1 Parent(s): 2cec5fb

Update app.py

Files changed (1)
  1. app.py +37 -57
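
The updated app.py (diff below) imports its access token from a separate config module instead of embedding it in the app. A minimal config.py sketch — assuming the token is supplied through an environment variable; the variable name is illustrative, not part of the commit — might be:

import os

# Keep the Hugging Face access token out of the repository by reading it from the environment.
HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]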
app.py CHANGED
@@ -1,38 +1,20 @@
-import os
 import streamlit as st
-import onnxruntime as ort
-from transformers import AutoTokenizer, AutoProcessor
+from huggingface_hub import InferenceClient
+from config import HUGGINGFACE_API_KEY  # Import your API key from a separate config file
 from PIL import Image
 from io import BytesIO
 
-# Download ONNX models if they do not already exist
-if not os.path.exists("vision_encoder_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
-if not os.path.exists("decoder_model_merged_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
-if not os.path.exists("embed_tokens_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
-
-# Load tokenizer and processor
-tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
-processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
-
-# Load ONNX sessions
-vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
-decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
-embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")
-
 # Streamlit App Configuration
-st.set_page_config(page_title="Vision-Based ONNX AI App", page_icon="🤖", layout="wide")
-st.title("🖼️ Vision-Based ONNX AI Demo App")
-st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and get a description</p>", unsafe_allow_html=True)
+st.set_page_config(page_title="Llama-3.2 Demo App", page_icon="🤖", layout="wide")
+st.title("🖼️ Llama-3.2-90B-Vision-Instruct Demo App")
+st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and receive a text description of its content</p>", unsafe_allow_html=True)
 
-# User Input: Image Upload
+# User Inputs
 uploaded_image = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])
-user_prompt = st.text_input("Enter your prompt", value="Describe this image in detail", placeholder="e.g., What is shown in the image?")
+user_prompt = st.text_input("Enter your prompt", value="Describe this image in a paragraph", placeholder="e.g., What is shown in the image?")
 
-# Display uploaded image
-def display_uploaded_image(uploaded_image):
+# Function to display the uploaded image
+def show_uploaded_image(uploaded_image):
     try:
         img = Image.open(uploaded_image)
         st.image(img, caption="Uploaded Image", use_container_width=True)
@@ -41,57 +23,55 @@ def display_uploaded_image(uploaded_image):
         st.error(f"❌ Unable to display image. Error: {e}")
         return None
 
-# Process the uploaded image
-if st.button("Get Description"):
+# Process user input
+if st.button("Get Description", key="get_description"):
     if uploaded_image and user_prompt:
         try:
             # Display the uploaded image
-            img = display_uploaded_image(uploaded_image)
+            img = show_uploaded_image(uploaded_image)
             if img is None:
                 st.error("❌ Image processing failed.")
                 st.stop()
 
-            # Preprocess the image
+            # Convert the image to bytes for model input
             img_buffer = BytesIO()
             img.save(img_buffer, format="PNG")
             img_bytes = img_buffer.getvalue()
-            processed_image = processor(images=img, return_tensors="np")
-
-            # Generate embeddings using the vision encoder
-            vision_embeddings = vision_encoder_session.run(
-                None, {"pixel_values": processed_image["pixel_values"]}
-            )[0]
 
-            # Tokenize the user prompt
-            inputs = tokenizer(user_prompt, return_tensors="np")
-            input_ids = inputs["input_ids"]
-
-            # Generate embedded tokens
-            embedded_tokens = embed_tokens_session.run(
-                None, {"input_ids": input_ids}
-            )[0]
-
-            # Generate a response using the decoder
-            decoder_outputs = decoder_session.run(
-                None, {
-                    "vision_embeddings": vision_embeddings,
-                    "embedded_tokens": embedded_tokens
+            # Initialize the InferenceClient
+            client = InferenceClient(api_key=HUGGINGFACE_API_KEY)
+
+            # Define messages for the model
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": user_prompt},
+                        {"type": "image", "image": {"bytes": img_bytes}}
+                    ]
                 }
-            )[0]
+            ]
+
+            # Call the model
+            completion = client.chat.completions.create(
+                model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+                messages=messages,
+                max_tokens=500
+            )
 
-            # Decode the output
-            description = tokenizer.decode(decoder_outputs, skip_special_tokens=True)
+            # Extract JSON response
+            model_response = completion.choices[0].message
 
-            # Display the description
+            # Display the result
             st.subheader("📝 Model Response")
-            st.markdown(f"**Description**: {description}")
+            st.markdown(f"**Description**: {model_response.get('content', 'No description available')}")
 
         except Exception as e:
             st.error(f"❌ An error occurred: {e}")
     else:
         st.warning("⚠️ Please upload an image and enter a prompt.")
 
-# UI Enhancements
+# Clean UI Enhancements
 st.markdown("""
     <style>
     .stButton>button {
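
For reference, a minimal standalone sketch of the new inference path. It assumes a valid Hugging Face access token and passes the image as a base64 data URL with the OpenAI-style "image_url" content type, since the raw-bytes {"type": "image"} payload in the committed code may not be accepted by the chat-completion endpoint; the helper name describe_image and the placeholder token are illustrative, not part of the commit.

import base64
from huggingface_hub import InferenceClient

HUGGINGFACE_API_KEY = "hf_..."  # placeholder; the app imports this from config.py

def describe_image(image_bytes: bytes, prompt: str) -> str:
    """Send a PNG image plus a text prompt to the hosted vision model and return its reply."""
    client = InferenceClient(api_key=HUGGINGFACE_API_KEY)

    # Embed the image in the JSON payload as a base64 data URL.
    data_url = "data:image/png;base64," + base64.b64encode(image_bytes).decode("utf-8")

    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        max_tokens=500,
    )

    # choices[0].message is an object, so read its .content attribute rather than calling .get().
    return completion.choices[0].message.content

In the Streamlit app, the img_bytes produced in the button handler would be passed to this helper and the returned string shown with st.markdown.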