ruslanmv committed on
Commit
aec88ad
·
verified ·
1 Parent(s): b0cdadf

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +27 -24
src/app.py CHANGED
@@ -127,8 +127,8 @@ def generate(
127
  yield collected_answer # Yield initial part of answer
128
 
129
  elif reasoning_started and not answer_started:
130
- collected_reasoning = text # Accumulate reasoning tokens
131
- yield text # Stream reasoning tokens
132
 
133
  elif answer_started:
134
  collected_answer += text # Accumulate answer tokens
@@ -146,7 +146,7 @@ def get_text_from_content(content):
146
  if item["type"] == "text":
147
  texts.append(item["text"])
148
  elif item["type"] == "image":
149
- texts.append("<image>")
150
  return " ".join(texts)
151
 
152
  @spaces.GPU
@@ -179,29 +179,32 @@ def chat_inference(image, text, conversation, temperature=VISION_TEMPERATURE, to
179
  output = vision_model.generate(**inputs, **generation_kwargs)
180
  assistant_response = vision_processor.decode(output[0], skip_special_tokens=True)
181
 
182
- reasoning = ""
183
- answer = ""
184
- if "<reasoning>" in assistant_response and "<answer>" in assistant_response:
185
- reasoning_start = assistant_response.find("<reasoning>") + len("<reasoning>")
186
- reasoning_end = assistant_response.find("</reasoning>")
187
- reasoning = assistant_response[reasoning_start:reasoning_end].strip()
188
 
189
- answer_start = assistant_response.find("<answer>") + len("<answer>")
190
- answer_end = assistant_response.find("</answer>")
 
 
 
 
 
 
 
191
 
192
- if answer_end != -1: # Handle cases where answer end tag is present
193
- answer = assistant_response[answer_start:answer_end].strip()
194
- else: # Fallback if answer end tag is missing (less robust)
195
- answer = assistant_response[answer_start:].strip()
196
-
197
-
198
- formatted_response_content = []
199
- if reasoning:
200
- formatted_response_content.append({"type": "text", "text": f"[Reasoning]: {reasoning}"})
201
- formatted_response_content.append({"type": "text", "text": f"[Answer]: {answer}"})
202
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
- conversation.append({"role": "assistant", "content": formatted_response_content})
205
  return display_vision_conversation(conversation), conversation
206
 
207
  # =============================================================================
@@ -238,7 +241,7 @@ def display_vision_conversation(conversation):
238
  assistant_content = conversation[i+1]["content"]
239
  assistant_text_parts = []
240
  for item in assistant_content:
241
- if item["type"] == "text":
242
  assistant_text_parts.append(item["text"])
243
  assistant_msg = "\n".join(assistant_text_parts).strip()
244
  i += 2
@@ -322,7 +325,7 @@ with gr.Blocks(fill_height=True, css_paths=css_file_path, head_paths=head_file_p
322
  vision_top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, value=VISION_TOP_P, step=0.01, label="Vision Top p", elem_classes=["gr_accordion_element"])
323
  vision_top_k_slider = gr.Slider(minimum=0, maximum=100, value=VISION_TOP_K, step=1, label="Vision Top k", elem_classes=["gr_accordion_element"])
324
  vision_max_tokens_slider = gr.Slider(minimum=10, maximum=300, value=VISION_MAX_TOKENS, step=1, label="Vision Max Tokens", elem_classes=["gr_accordion_element"])
325
- send_button = gr.Button("Send Message")
326
  clear_button = gr.Button("Clear Chat")
327
 
328
  # Conversation state variables for each branch.
 
127
  yield collected_answer # Yield initial part of answer
128
 
129
  elif reasoning_started and not answer_started:
130
+ collected_reasoning = text # Accumulate reasoning tokens
131
+ yield text # Stream reasoning tokens
132
 
133
  elif answer_started:
134
  collected_answer += text # Accumulate answer tokens
 
146
  if item["type"] == "text":
147
  texts.append(item["text"])
148
  elif item["type"] == "image":
149
+ texts.append("<Image>")
150
  return " ".join(texts)
151
 
152
  @spaces.GPU
 
179
  output = vision_model.generate(**inputs, **generation_kwargs)
180
  assistant_response = vision_processor.decode(output[0], skip_special_tokens=True)
181
 
 
 
 
 
 
 
182
 
183
+ ### For future versions of Vision with Reasoning
184
+ vision_reasoning=False
185
+ if vision_reasoning:
186
+ reasoning = ""
187
+ answer = ""
188
+ if "<reasoning>" in assistant_response and "<answer>" in assistant_response:
189
+ reasoning_start = assistant_response.find("<reasoning>") + len("<reasoning>")
190
+ reasoning_end = assistant_response.find("</reasoning>")
191
+ reasoning = assistant_response[reasoning_start:reasoning_end].strip()
192
 
193
+ answer_start = assistant_response.find("<answer>") + len("<answer>")
194
+ answer_end = assistant_response.find("</answer>")
 
 
 
 
 
 
 
 
195
 
196
+ if answer_end != -1: # Handle cases where answer end tag is present
197
+ answer = assistant_response[answer_start:answer_end].strip()
198
+ else: # Fallback if answer end tag is missing (less robust)
199
+ answer = assistant_response[answer_start:].strip()
200
+ formatted_response_content = []
201
+ if reasoning:
202
+ formatted_response_content.append({"type": "text", "text": f"[Reasoning]: {reasoning}"})
203
+ formatted_response_content.append({"type": "text", "text": f"[Answer]: {answer}"})
204
+ conversation.append({"role": "assistant", "content": formatted_response_content})
205
+ else:
206
+ conversation.append({"role": "assistant", "content": [{"type": "text", "text": assistant_response.strip()}]})
207
 
 
208
  return display_vision_conversation(conversation), conversation
209
 
210
  # =============================================================================
 
241
  assistant_content = conversation[i+1]["content"]
242
  assistant_text_parts = []
243
  for item in assistant_content:
244
+ if item["type"] == "text":
245
  assistant_text_parts.append(item["text"])
246
  assistant_msg = "\n".join(assistant_text_parts).strip()
247
  i += 2
 
325
  vision_top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, value=VISION_TOP_P, step=0.01, label="Vision Top p", elem_classes=["gr_accordion_element"])
326
  vision_top_k_slider = gr.Slider(minimum=0, maximum=100, value=VISION_TOP_K, step=1, label="Vision Top k", elem_classes=["gr_accordion_element"])
327
  vision_max_tokens_slider = gr.Slider(minimum=10, maximum=300, value=VISION_MAX_TOKENS, step=1, label="Vision Max Tokens", elem_classes=["gr_accordion_element"])
328
+ send_button = gr.Button("Send Message")
329
  clear_button = gr.Button("Clear Chat")
330
 
331
  # Conversation state variables for each branch.