ankandrew committed on
Commit
b3d5d95
·
1 Parent(s): c5c055b
Files changed (1) hide show
  1. app.py +46 -8
app.py CHANGED
@@ -21,7 +21,7 @@ MODEL_NAMES = {
21
 
22
 
23
  @spaces.GPU(duration=300)
24
- def run_inference(model_key, input_type, text, image, video, fps):
25
  """
26
  Load the selected Qwen2.5-VL model and run inference on text, image, or video.
27
  """
@@ -54,11 +54,17 @@ def run_inference(model_key, input_type, text, image, video, fps):
54
  video_src = video if str(video).startswith("file://") else f"file://{video}"
55
  content.append({"type": "video", "video": video_src, "fps": fps})
56
  content.append({"type": "text", "text": text or ""})
57
- msg = [{"role": "user", "content": content}]
 
 
 
58
 
59
  # Prepare inputs for model with video kwargs
60
  text_prompt = processor.apply_chat_template(
61
- msg, tokenize=False, add_generation_prompt=True
 
 
 
62
  )
63
  image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
64
  inputs = processor(
@@ -83,16 +89,39 @@ with demo:
83
  gr.Markdown("# Qwen2.5-VL Multimodal Demo")
84
  model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model")
85
  input_type = gr.Radio(["text", "image", "video"], label="Input Type")
86
- text_input = gr.Textbox(lines=3, placeholder="Enter text...", visible=True)
87
- image_input = gr.Image(type="filepath", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  video_input = gr.Video(visible=False)
89
- fps_input = gr.Slider(minimum=0.1, maximum=30.0, step=0.1, value=2.0, label="FPS", visible=False)
 
 
 
 
90
  output = gr.Textbox(label="Output")
91
 
92
  # Show/hide inputs based on selection
93
  def update_inputs(choice):
94
  return (
95
- gr.update(visible=(choice == "text")),
96
  gr.update(visible=(choice == "image")),
97
  gr.update(visible=(choice == "video")),
98
  gr.update(visible=(choice == "video"))
@@ -102,7 +131,16 @@ with demo:
102
  run_btn = gr.Button("Generate")
103
  run_btn.click(
104
  run_inference,
105
- [model_select, input_type, text_input, image_input, video_input, fps_input],
 
 
 
 
 
 
 
 
 
106
  output
107
  )
108
 
 
21
 
22
 
23
  @spaces.GPU(duration=300)
24
+ def run_inference(model_key, input_type, text, image, video, fps, system_prompt, add_vision_id):
25
  """
26
  Load the selected Qwen2.5-VL model and run inference on text, image, or video.
27
  """
 
54
  video_src = video if str(video).startswith("file://") else f"file://{video}"
55
  content.append({"type": "video", "video": video_src, "fps": fps})
56
  content.append({"type": "text", "text": text or ""})
57
+ msg = [
58
+ {"role": "system", "content": system_prompt},
59
+ {"role": "user", "content": content}
60
+ ]
61
 
62
  # Prepare inputs for model with video kwargs
63
  text_prompt = processor.apply_chat_template(
64
+ msg,
65
+ tokenize=False,
66
+ add_generation_prompt=True,
67
+ add_vision_id=add_vision_id
68
  )
69
  image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
70
  inputs = processor(
 
89
  gr.Markdown("# Qwen2.5-VL Multimodal Demo")
90
  model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model")
91
  input_type = gr.Radio(["text", "image", "video"], label="Input Type")
92
+ system_prompt_input = gr.Textbox(
93
+ lines=2,
94
+ placeholder="System prompt…",
95
+ value="You are a helpful assistant.",
96
+ label="System Prompt"
97
+ )
98
+ vision_id_checkbox = gr.Checkbox(
99
+ label="Add vision ID",
100
+ value=False
101
+ )
102
+ text_input = gr.Textbox(
103
+ lines=3,
104
+ placeholder="Enter text ...",
105
+ visible=True
106
+ )
107
+ image_input = gr.File(
108
+ file_count="multiple",
109
+ file_types=["image"],
110
+ label="Upload Images",
111
+ visible=False
112
+ )
113
  video_input = gr.Video(visible=False)
114
+ fps_input = gr.Number(
115
+ value=2.0,
116
+ label="FPS",
117
+ visible=False
118
+ )
119
  output = gr.Textbox(label="Output")
120
 
121
  # Show/hide inputs based on selection
122
  def update_inputs(choice):
123
  return (
124
+ gr.update(visible=True),
125
  gr.update(visible=(choice == "image")),
126
  gr.update(visible=(choice == "video")),
127
  gr.update(visible=(choice == "video"))
 
131
  run_btn = gr.Button("Generate")
132
  run_btn.click(
133
  run_inference,
134
+ [
135
+ model_select,
136
+ input_type,
137
+ text_input,
138
+ image_input,
139
+ video_input,
140
+ fps_input,
141
+ system_prompt_input,
142
+ vision_id_checkbox
143
+ ],
144
  output
145
  )
146