Qwen-VL

Running on Zero

App Files Files Community

KingNish committed on Sep 17, 2024

Commit

6bf8982

verified ·

1 Parent(s): 1ac43cd

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -11

app.py CHANGED Viewed

@@ -16,18 +16,34 @@ import os
 #     "Qwen/Qwen2-VL-7B-Instruct": AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
 # }
-def array_to_image_path(image_array):
-    if image_array is None:
         raise ValueError("No image provided. Please upload an image before submitting.")
-    # Convert numpy array to PIL Image
-    img = Image.fromarray(np.uint8(image_array))
     # Generate a unique filename using timestamp
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     filename = f"image_{timestamp}.png"
     # Save the image
-    img.save(filename)
     # Get the full path of the saved image
     full_path = os.path.abspath(filename)
@@ -53,15 +69,12 @@ assistant_prompt = '<|assistant|>\n'
 prompt_suffix = "<|end|>\n"
 @spaces.GPU
-def run_example(image, text_input=None, model_id="Qwen/Qwen2-VL-7B-Instruct"):
     image_path = array_to_image_path(image)
     print(image_path)
     model = models[model_id]
     processor = processors[model_id]
-    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
-    image = Image.fromarray(image).convert("RGB")
     messages = [
     {
             "role": "user",
@@ -100,6 +113,54 @@ def run_example(image, text_input=None, model_id="Qwen/Qwen2-VL-7B-Instruct"):
     return output_text[0]
 css = """
   #output {
     height: 500px;
@@ -113,14 +174,25 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab(label="Qwen2-VL-7B Input"):
         with gr.Row():
             with gr.Column():
-                input_img = gr.Image(label="Input Picture")
                 model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
                 text_input = gr.Textbox(label="Question")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
                 output_text = gr.Textbox(label="Output Text")
-        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])
 demo.queue(api_open=False)
 demo.launch(debug=True)

 #     "Qwen/Qwen2-VL-7B-Instruct": AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
 # }
+def array_to_image_path(image):
+    if image is None:
+        gr.Warning("No video provided. Please upload an video before submitting.")
         raise ValueError("No image provided. Please upload an image before submitting.")
     # Generate a unique filename using timestamp
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     filename = f"image_{timestamp}.png"
     # Save the image
+    image.save(filename)
+    # Get the full path of the saved image
+    full_path = os.path.abspath(filename)
+    return full_path
+def array_to_video_path(video):
+    if video is None:
+        gr.Warning("No video provided. Please upload an video before submitting.")
+        raise ValueError("No video provided. Please upload an video before submitting.")
+    # Generate a unique filename using timestamp
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = f"video_{timestamp}.mp4"
+    # Save the video
+    video.save(filename)
     # Get the full path of the saved video
     full_path = os.path.abspath(filename)
 prompt_suffix = "<|end|>\n"
 @spaces.GPU
+def qwen_image(image, text_input=None, model_id="Qwen/Qwen2-VL-7B-Instruct"):
     image_path = array_to_image_path(image)
     print(image_path)
     model = models[model_id]
     processor = processors[model_id]
     messages = [
     {
             "role": "user",
     return output_text[0]
+@spaces.GPU(duration=125)
+def qwen_video(video, text_input=None, model_id="Qwen/Qwen2-VL-7B-Instruct"):
+    video_path = array_to_video_path(video)
+    print(video_path)
+    model = models[model_id]
+    processor = processors[model_id]
+    messages = [
+    {
+            "role": "user",
+            "content": [
+                {
+                    "type": "video",
+                    "video": video_path,
+                    "max_pixels": 360 * 420,
+                    "fps": 6.0,
+                },
+                {"type": "text", "text": text_input},
+            ],
+        }
+    ]
+    # Preparation for inference
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = inputs.to("cuda")
+    # Inference: Generation of the output
+    generated_ids = model.generate(**inputs, max_new_tokens=1024)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    return output_text[0]
 css = """
   #output {
     height: 500px;
     with gr.Tab(label="Qwen2-VL-7B Input"):
         with gr.Row():
             with gr.Column():
+                input_img = gr.Image(label="Input Picture", type="pil")
+                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
+                text_input = gr.Textbox(label="Question")
+                submit_btn = gr.Button(value="Submit")
+            with gr.Column():
+                output_text = gr.Textbox(label="Output Text")
+        submit_btn.click(qwen_image, [input_img, text_input, model_selector], [output_text])
+    with gr.Tab(label="Qwen2-VL-7B Input"):
+        with gr.Row():
+            with gr.Column():
+                input_img = gr.Video(label="Input Picture", type="pil")
                 model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
                 text_input = gr.Textbox(label="Question")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
                 output_text = gr.Textbox(label="Output Text")
+        submit_btn.click(qwen_video, [input_img, text_input, model_selector], [output_text])
 demo.queue(api_open=False)
 demo.launch(debug=True)