Update app.py
app.py CHANGED
@@ -67,7 +67,7 @@ def initialize_model():
     model_status = f"Loading model ({EXECUTION_PROVIDER.upper()})..."
     logging.info(model_status)
     try:
-        # FIX:
+        # FIX: Removed explicit DeviceType. Let the library infer or use string if needed by constructor.
         # The simple constructor often works by detecting the installed ORT package.
         logging.info(f"Using provider based on installed package (expecting: {EXECUTION_PROVIDER})")
         model = og.Model(model_path) # Simplified model loading
@@ -105,19 +105,18 @@ def generate_response_stream(prompt, history, max_length, temperature, top_p, to
     full_prompt += f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

     logging.info(f"Generating response (MaxL: {max_length}, Temp: {temperature}, TopP: {top_p}, TopK: {top_k})")
-    # logging.debug(f"Full Formatted Prompt:\n{full_prompt}")

     try:
         input_tokens = tokenizer.encode(full_prompt)

+        # FIX: Removed eos_token_id and pad_token_id as they are not attributes
+        # of onnxruntime_genai.Tokenizer and likely handled internally by the generator.
         search_options = {
             "max_length": max_length,
             "temperature": temperature,
             "top_p": top_p,
             "top_k": top_k,
             "do_sample": True,
-            "eos_token_id": tokenizer.eos_token_id,
-            "pad_token_id": tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else tokenizer.eos_token_id,
         }

         params = og.GeneratorParams(model)
@@ -131,29 +130,23 @@ def generate_response_stream(prompt, history, max_length, temperature, top_p, to

         first_token_time = None
         token_count = 0
+        # Rely primarily on generator.is_done()
         while not generator.is_done():
             generator.compute_logits()
             generator.generate_next_token()
             if first_token_time is None:
                 first_token_time = time.time() # Record time to first token
-            next_token = generator.get_next_tokens()[0]

-
-                logging.info("EOS token encountered.")
-                break
+            next_token = generator.get_next_tokens()[0]

             decoded_chunk = tokenizer.decode([next_token])
             token_count += 1

-            #
-            if decoded_chunk == "<|end|>":
-                logging.info("Assistant explicitly generated <|end|> token.")
-                break
-            if decoded_chunk == tokenizer.eos_token: # Check against tokenizer's eos_token string
-                logging.info("Assistant generated EOS token string.")
+            # Secondary check: Stop if the model explicitly generates the <|end|> string literal.
+            if decoded_chunk == "<|end|>":
+                logging.info("Assistant explicitly generated <|end|> token string.")
                 break

-
             yield decoded_chunk # Yield just the text chunk

         end_time = time.time()
@@ -169,13 +162,18 @@ def generate_response_stream(prompt, history, max_length, temperature, top_p, to
         model_status = f"Error during generation: {e}"
         yield f"\n\nSorry, an error occurred during generation: {e}" # Yield error message

+
 # --- Gradio Interface Functions ---

 # 1. Function to add user message to chat history
 def add_user_message(user_message, history):
     """Adds the user's message to the chat history for display."""
     if not user_message:
-
+        # Returning original history prevents adding empty message
+        # Use gr.Warning or gr.Info for user feedback? Or raise gr.Error?
+        # gr.Warning("Please enter a message.") # Shows warning toast
+        return "", history # Clear input, return unchanged history
+        # raise gr.Error("Please enter a message.") # Stops execution, shows error
     history = history + [[user_message, None]] # Append user message, leave bot response None
     return "", history # Clear input textbox, return updated history

@@ -183,7 +181,8 @@ def add_user_message(user_message, history):
 def generate_bot_response(history, max_length, temperature, top_p, top_k):
     """Generates the bot's response based on the history and streams it."""
     if not history or history[-1][1] is not None:
-        # This
+        # This case means user submitted empty message or something went wrong
+        # No need to generate if the last turn isn't user's pending turn
         return history

     user_prompt = history[-1][0] # Get the latest user prompt
@@ -196,7 +195,7 @@ def generate_bot_response(history, max_length, temperature, top_p, top_k):
     )

     # Stream the response chunks back to Gradio
-    history[-1][1] = "" # Initialize the bot response string
+    history[-1][1] = "" # Initialize the bot response string in the history
     for chunk in response_stream:
         history[-1][1] += chunk # Append the chunk to the bot's message in history
         yield history # Yield the *entire updated history* back to Chatbot
@@ -207,9 +206,9 @@ def clear_chat():
     global model_status # Keep model status indicator updated
     # Reset status only if it was showing an error from generation maybe?
     # Or just always reset to Ready if model is loaded.
-    if model and tokenizer:
+    if model and tokenizer and not model_status.startswith("Error") and not model_status.startswith("FATAL"):
         model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
-    # Keep the original error if init failed
+    # Keep the original error if init failed, otherwise show ready status
     return None, [], model_status # Clear Textbox, Chatbot history, and update status display
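
Note: the streaming loop this diff settles on can be exercised outside Gradio with a minimal sketch like the one below. It is an illustration only, not the app itself: it assumes a local onnxruntime-genai model folder at ./model (hypothetical path) and the older generator API that app.py uses (compute_logits / generate_next_token). Exact method names and the way input tokens are passed differ between onnxruntime_genai releases, so treat the params.input_ids assignment in particular as version-dependent.

import onnxruntime_genai as og

# Hypothetical model directory; substitute the path app.py resolves at startup.
model = og.Model("./model")
tokenizer = og.Tokenizer(model)

# Same chat-template markers that app.py builds into full_prompt.
prompt = "<|user|>\nHello!<|end|>\n<|assistant|>\n"
input_tokens = tokenizer.encode(prompt)

params = og.GeneratorParams(model)
params.set_search_options(max_length=256, temperature=0.7, top_p=0.9, top_k=50, do_sample=True)
params.input_ids = input_tokens  # older API surface; newer releases feed tokens to the generator instead

generator = og.Generator(model, params)
while not generator.is_done():
    generator.compute_logits()
    generator.generate_next_token()
    chunk = tokenizer.decode([generator.get_next_tokens()[0]])
    if chunk == "<|end|>":  # secondary stop check, mirroring the loop in this commit
        break
    print(chunk, end="", flush=True)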