Update src/app.py

src/app.py CHANGED (+27 -24)
@@ -127,8 +127,8 @@ def generate(
            yield collected_answer  # Yield initial part of answer

        elif reasoning_started and not answer_started:
-
-
+            collected_reasoning = text  # Accumulate reasoning tokens
+            yield text  # Stream reasoning tokens

        elif answer_started:
            collected_answer += text  # Accumulate answer tokens
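Note: the hunk above switches the reasoning phase from buffering to incremental streaming. Below is a minimal, runnable sketch of that state machine, assuming decoded text chunks arrive one at a time; stream_phases and the simulated chunk list are illustrations, not the app's actual code.

def stream_phases(chunks):
    """Yield output as chunks arrive, tracking reasoning vs. answer phases."""
    reasoning_started = False
    answer_started = False
    collected_reasoning = ""
    collected_answer = ""
    for text in chunks:
        if "<reasoning>" in text:  # Hypothetical phase markers in the stream
            reasoning_started = True
            continue
        if "<answer>" in text:
            answer_started = True
            continue
        if answer_started:
            collected_answer += text  # Accumulate answer tokens
            yield collected_answer    # Yield the growing answer
        elif reasoning_started:
            collected_reasoning += text  # Accumulate reasoning tokens
            yield text                   # Stream each reasoning chunk as-is

# Example: simulate a token stream.
for out in stream_phases(["<reasoning>", "think ", "first", "<answer>", "42"]):
    print(out)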
@@ -146,7 +146,7 @@ def get_text_from_content(content):
        if item["type"] == "text":
            texts.append(item["text"])
        elif item["type"] == "image":
-            texts.append("<
+            texts.append("<Image>")
    return " ".join(texts)

@spaces.GPU
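Note: get_text_from_content now represents image items with a literal "<Image>" placeholder. A self-contained sketch of the function as it reads after this hunk, with a hypothetical example input:

def get_text_from_content(content):
    """Flatten a multimodal content list into one display string."""
    texts = []
    for item in content:
        if item["type"] == "text":
            texts.append(item["text"])
        elif item["type"] == "image":
            texts.append("<Image>")  # Keep a visible marker where the image was
    return " ".join(texts)

content = [{"type": "image"}, {"type": "text", "text": "What is shown here?"}]
print(get_text_from_content(content))  # -> <Image> What is shown here?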
@@ -179,29 +179,32 @@ def chat_inference(image, text, conversation, temperature=VISION_TEMPERATURE, to
    output = vision_model.generate(**inputs, **generation_kwargs)
    assistant_response = vision_processor.decode(output[0], skip_special_tokens=True)

-    reasoning = ""
-    answer = ""
-    if "<reasoning>" in assistant_response and "<answer>" in assistant_response:
-        reasoning_start = assistant_response.find("<reasoning>") + len("<reasoning>")
-        reasoning_end = assistant_response.find("</reasoning>")
-        reasoning = assistant_response[reasoning_start:reasoning_end].strip()

-        answer_start = assistant_response.find("<answer>") + len("<answer>")
-        answer_end = assistant_response.find("</answer>")
+    ### For future versions of Vision with Reasoning
+    vision_reasoning=False
+    if vision_reasoning:
+        reasoning = ""
+        answer = ""
+        if "<reasoning>" in assistant_response and "<answer>" in assistant_response:
+            reasoning_start = assistant_response.find("<reasoning>") + len("<reasoning>")
+            reasoning_end = assistant_response.find("</reasoning>")
+            reasoning = assistant_response[reasoning_start:reasoning_end].strip()

-        if answer_end != -1:  # Handle cases where answer end tag is present
-            answer = assistant_response[answer_start:answer_end].strip()
-        else:  # Fallback if answer end tag is missing (less robust)
-            answer = assistant_response[answer_start:].strip()
-
-
-    formatted_response_content = []
-    if reasoning:
-        formatted_response_content.append({"type": "text", "text": f"[Reasoning]: {reasoning}"})
-    formatted_response_content.append({"type": "text", "text": f"[Answer]: {answer}"})
+            answer_start = assistant_response.find("<answer>") + len("<answer>")
+            answer_end = assistant_response.find("</answer>")

+            if answer_end != -1:  # Handle cases where answer end tag is present
+                answer = assistant_response[answer_start:answer_end].strip()
+            else:  # Fallback if answer end tag is missing (less robust)
+                answer = assistant_response[answer_start:].strip()
+            formatted_response_content = []
+            if reasoning:
+                formatted_response_content.append({"type": "text", "text": f"[Reasoning]: {reasoning}"})
+            formatted_response_content.append({"type": "text", "text": f"[Answer]: {answer}"})
+            conversation.append({"role": "assistant", "content": formatted_response_content})
+        else:
+            conversation.append({"role": "assistant", "content": [{"type": "text", "text": assistant_response.strip()}]})

-    conversation.append({"role": "assistant", "content": formatted_response_content})
    return display_vision_conversation(conversation), conversation

# =============================================================================
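Note: this hunk parks the <reasoning>/<answer> parsing behind a vision_reasoning flag that is hard-coded to False, so current checkpoints take the new else branch and the raw response is appended unchanged. The parsing itself reads naturally as a pure function; the sketch below mirrors the diff's logic under the hypothetical name parse_tagged_response.

def parse_tagged_response(assistant_response):
    """Split a tagged response into (reasoning, answer) strings."""
    reasoning, answer = "", ""
    if "<reasoning>" in assistant_response and "<answer>" in assistant_response:
        reasoning_start = assistant_response.find("<reasoning>") + len("<reasoning>")
        reasoning_end = assistant_response.find("</reasoning>")
        reasoning = assistant_response[reasoning_start:reasoning_end].strip()
        answer_start = assistant_response.find("<answer>") + len("<answer>")
        answer_end = assistant_response.find("</answer>")
        if answer_end != -1:  # Closing tag present
            answer = assistant_response[answer_start:answer_end].strip()
        else:  # Fallback when the closing tag is missing (less robust)
            answer = assistant_response[answer_start:].strip()
    return reasoning, answer

print(parse_tagged_response("<reasoning>check the tag</reasoning><answer>a cat</answer>"))
# -> ('check the tag', 'a cat')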
@@ -238,7 +241,7 @@ def display_vision_conversation(conversation):
        assistant_content = conversation[i+1]["content"]
        assistant_text_parts = []
        for item in assistant_content:
-
+            if item["type"] == "text":
                assistant_text_parts.append(item["text"])
        assistant_msg = "\n".join(assistant_text_parts).strip()
        i += 2
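Note: the guard added here keeps non-text items out of the assistant transcript. A sketch of the surrounding pairing loop, assuming the chatbot consumes (user, assistant) string tuples; to_chat_pairs is a hypothetical name, since display_vision_conversation's full body is not visible in this diff.

def to_chat_pairs(conversation):
    """Pair user/assistant turns into (user, assistant) display tuples."""
    pairs = []
    i = 0
    while i + 1 < len(conversation):
        user_parts = [item.get("text", "<Image>") for item in conversation[i]["content"]]
        assistant_text_parts = [
            item["text"]
            for item in conversation[i + 1]["content"]
            if item["type"] == "text"  # The fix above: skip non-text items
        ]
        pairs.append((" ".join(user_parts), "\n".join(assistant_text_parts).strip()))
        i += 2
    return pairs

conversation = [
    {"role": "user", "content": [{"type": "text", "text": "Describe the image."}]},
    {"role": "assistant", "content": [{"type": "text", "text": "[Answer]: A cat."}]},
]
print(to_chat_pairs(conversation))  # -> [('Describe the image.', '[Answer]: A cat.')]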
@@ -322,7 +325,7 @@ with gr.Blocks(fill_height=True, css_paths=css_file_path, head_paths=head_file_p
        vision_top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, value=VISION_TOP_P, step=0.01, label="Vision Top p", elem_classes=["gr_accordion_element"])
        vision_top_k_slider = gr.Slider(minimum=0, maximum=100, value=VISION_TOP_K, step=1, label="Vision Top k", elem_classes=["gr_accordion_element"])
        vision_max_tokens_slider = gr.Slider(minimum=10, maximum=300, value=VISION_MAX_TOKENS, step=1, label="Vision Max Tokens", elem_classes=["gr_accordion_element"])
-
+        send_button = gr.Button("Send Message")
        clear_button = gr.Button("Clear Chat")

        # Conversation state variables for each branch.
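Note: the UI gains an explicit "Send Message" button next to "Clear Chat". A hedged, self-contained sketch of how such a button is typically wired in Gradio Blocks follows; every component name except send_button and clear_button is an assumption, and the stub handler merely stands in for the app's chat_inference.

import gradio as gr

def chat_inference(image, text, conversation, temperature, top_p, top_k, max_tokens):
    # Stub handler: echo the message so the wiring can be exercised without a model.
    conversation = (conversation or []) + [[text, f"(echo) {text}"]]
    return conversation, conversation

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    image_input = gr.Image(type="pil")
    text_input = gr.Textbox(label="Message")
    temperature = gr.Slider(0.0, 2.0, value=0.7, label="Vision Temperature")
    top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Vision Top p")
    top_k = gr.Slider(0, 100, value=50, step=1, label="Vision Top k")
    max_tokens = gr.Slider(10, 300, value=150, step=1, label="Vision Max Tokens")
    send_button = gr.Button("Send Message")
    clear_button = gr.Button("Clear Chat")
    state = gr.State([])

    send_button.click(
        chat_inference,
        inputs=[image_input, text_input, state, temperature, top_p, top_k, max_tokens],
        outputs=[chatbot, state],
    )
    clear_button.click(lambda: ([], []), outputs=[chatbot, state])

# demo.launch()  # Uncomment to run locally.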