maahin committed
Commit cc851d6 · verified · Parent: 2379c94

Update app.py

Files changed (1)
  1. app.py +14 -24
app.py CHANGED
@@ -2,9 +2,9 @@ import os
 import streamlit as st
 from PIL import Image
 import torch
-from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration
 
-# Get Hugging Face API key from Hugging Face Spaces secrets
+# Get Hugging Face API key from environment variables
 HF_TOKEN = os.getenv("HF_KEY")
 
 # Ensure API key is available
@@ -12,12 +12,12 @@ if not HF_TOKEN:
     st.error("❌ Hugging Face API key not found! Set it as 'HF_KEY' in Spaces secrets.")
     st.stop()
 
-# Load the PaliGemma model and processor
+# Load the model and processor
 @st.cache_resource
 def load_model():
-    model_name = "google/paligemma2-3b-mix-224"
-    processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
-    model = AutoModelForVision2Seq.from_pretrained(model_name, token=HF_TOKEN)
+    model_id = "google/paligemma2-3b-mix-224"
+    model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto").eval()
+    processor = PaliGemmaProcessor.from_pretrained(model_id)
     return processor, model
 
 processor, model = load_model()
@@ -31,33 +31,23 @@ if uploaded_file:
     image = Image.open(uploaded_file).convert("RGB")
     st.image(image, caption="Uploaded Image", use_container_width=True)
 
-    # User selects the task
+    # User input for task selection
     task = st.selectbox(
         "Select a task:",
         ["Generate a caption", "Answer a question", "Detect objects", "Generate segmentation"]
     )
 
-    # User input for question/prompt
+    # User prompt
     prompt = st.text_area("Enter a prompt (e.g., 'Describe the image' or 'What objects are present?')")
 
     if st.button("Run"):
         if prompt:
-            inputs = processor(text=prompt, images=image, return_tensors="pt")
+            inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch.bfloat16).to(model.device)
+            input_len = inputs["input_ids"].shape[-1]  # Get input length
 
-            with torch.no_grad():
-                output = model.generate(**inputs)
-
-            raw_output = processor.batch_decode(output, skip_special_tokens=False)[0]
-
-            # Handle different outputs
-            if task == "Generate a caption":
-                answer = raw_output
-            elif task == "Answer a question":
-                answer = raw_output
-            elif task == "Detect objects":
-                answer = f"Object bounding boxes: {raw_output}"
-            elif task == "Generate segmentation":
-                answer = f"Segmentation codes: {raw_output}"
+            with torch.inference_mode():
+                generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+                generation = generation[0][input_len:]  # Remove input tokens from output
+                answer = processor.decode(generation, skip_special_tokens=True)
 
             st.success(f"✅ Result: {answer}")
-
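
For context, the rewritten generation path follows the standard PaliGemma 2 inference pattern from transformers: inputs are cast to the model's bfloat16 dtype, and the prompt tokens are sliced off the output before decoding so only the newly generated text is shown. Below is a minimal standalone sketch of the same pattern outside Streamlit; the image path and prompt are placeholder values, not from the commit.

import torch
from PIL import Image
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration

model_id = "google/paligemma2-3b-mix-224"

# Load in bfloat16; device_map="auto" places the weights on the available device
model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
).eval()
processor = PaliGemmaProcessor.from_pretrained(model_id)

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
prompt = "caption en"  # placeholder prompt

# Match the input dtype to the model and move the tensors to its device
inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch.bfloat16).to(model.device)
input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    # Slice off the echoed prompt tokens so only the answer is decoded
    answer = processor.decode(generation[0][input_len:], skip_special_tokens=True)

print(answer)

Trimming by input_len is what lets the commit drop the old per-task branches: generate() returns the prompt tokens followed by the completion, so slicing at the prompt length leaves just the model's answer for whichever task was requested.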