Spaces:

JosephZ
/

R1-SGG

Running on Zero

App Files Community

JosephZ committed 8 days ago

Commit

8cebbf6

verified ·

1 Parent(s): 1873cee

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -29

app.py CHANGED Viewed

@@ -10,7 +10,6 @@ import torch
 from transformers import Qwen2VLForConditionalGeneration, GenerationConfig, AutoProcessor
 import spaces
-from vllm import LLM, SamplingParams
 def extract_answer_content(text: str) -> str:
     """
@@ -63,10 +62,6 @@ SYSTEM_PROMPT = (
 processor = AutoProcessor.from_pretrained("JosephZ/qwen2vl-7b-sft-grpo-close-sgg", max_pixels=1024*28*28)
 device='cuda' if torch.cuda.is_available() else "cpu"
-model_name = "JosephZ/qwen2vl-7b-sft-grpo-close-sgg"
-"""
 model = Qwen2VLForConditionalGeneration.from_pretrained("JosephZ/qwen2vl-7b-sft-grpo-close-sgg",
              torch_dtype=torch.bfloat16,
              device_map=device)
@@ -80,25 +75,9 @@ generation_config=GenerationConfig(
         max_new_tokens=2048,
         use_cache=True
 )
-"""
-model = LLM(
-            model=model_name,
-            limit_mm_per_prompt={"image": 1},
-            dtype='bfloat16',
-            #device=device,
-            max_model_len=4096,
-            mm_processor_kwargs= { "max_pixels": 1024*28*28, "min_pixels": 4*28*28},
-)
-sampling_params = SamplingParams(
-        temperature=0.01,
-        top_k=1,
-        top_p=0.001,
-        repetition_penalty=1.0,
-        max_tokens=2048,
-)
 def build_prompt(image, user_text):
-    base64_image = encode_image_to_base64(image)
     messages = [
         {
             "role": "system",
@@ -107,8 +86,8 @@ def build_prompt(image, user_text):
         {
             "role": "user",
             "content": [
-                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
-                # {"type": "image"},
                 {"type": "text", "text": user_text},
             ],
         },
@@ -176,17 +155,30 @@ def scale_box(box, scale):
 def generate_sgg(image):
     global model
     iw, ih = image.size
     scale_factors = (iw / 1000.0, ih / 1000.0)
     conversation = build_prompt(image, PROMPT_CLOSE)
     with torch.no_grad():
-        outputs = model.chat([conversation], sampling_params=sampling_params)
-        output_texts = [output.outputs[0].text for output in outputs]
-    output_text = output_texts[0]
     resp = extract_answer_content(output_text)
     try:
@@ -226,4 +218,4 @@ gr.Interface(
     outputs=[gr.Image(type="pil"), gr.Textbox(label="Scene Graph")],
     title="R1-SGG: Compile Scene Graphs with Reinforcement Learning",
     description="Upload an image and generate a structured scene graph in JSON format."
-).launch(share=True)

 from transformers import Qwen2VLForConditionalGeneration, GenerationConfig, AutoProcessor
 import spaces
 def extract_answer_content(text: str) -> str:
     """
 processor = AutoProcessor.from_pretrained("JosephZ/qwen2vl-7b-sft-grpo-close-sgg", max_pixels=1024*28*28)
 device='cuda' if torch.cuda.is_available() else "cpu"
 model = Qwen2VLForConditionalGeneration.from_pretrained("JosephZ/qwen2vl-7b-sft-grpo-close-sgg",
              torch_dtype=torch.bfloat16,
              device_map=device)
         max_new_tokens=2048,
         use_cache=True
 )
 def build_prompt(image, user_text):
+    #base64_image = encode_image_to_base64(image)
     messages = [
         {
             "role": "system",
         {
             "role": "user",
             "content": [
+                #{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
+                 {"type": "image"},
                 {"type": "text", "text": user_text},
             ],
         },
 def generate_sgg(image):
     global model
+    device='cuda' if torch.cuda.is_available() else "cpu"
+    if next(model.parameters()).device != torch.device(device):
+        model = model.to(device)
     iw, ih = image.size
     scale_factors = (iw / 1000.0, ih / 1000.0)
     conversation = build_prompt(image, PROMPT_CLOSE)
+    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+    inputs = processor(
+        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
+    )
+    inputs = inputs.to(model.device)
     with torch.no_grad():
+        output_ids = model.generate(**inputs, generation_config=generation_config)
+    generated_ids = [
+        output_ids[len(input_ids) :]
+        for input_ids, output_ids in zip(inputs.input_ids, output_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
+    )[0]
     resp = extract_answer_content(output_text)
     try:
     outputs=[gr.Image(type="pil"), gr.Textbox(label="Scene Graph")],
     title="R1-SGG: Compile Scene Graphs with Reinforcement Learning",
     description="Upload an image and generate a structured scene graph in JSON format."
+).launch(share=True)