Update app.py
app.py (changed)
Removed from the previous version:

@@ -13,33 +13,35 @@
- # Free Spaces generally provide CPU resources.
- # If deploying on a paid GPU Space, you would change these
- # and the requirements.txt accordingly.
- # --- Alternative GPU Configuration
- HF_LOGO_URL = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
- global model, tokenizer

@@ -49,59 +51,62 @@ def initialize_model():
- # Use cache_dir for potentially faster re-runs if space storage allows caching
- # cache_dir = os.path.join(LOCAL_MODEL_DIR, ".cache") # Optional: Define cache dir
- local_dir_use_symlinks=False
- # cache_dir=cache_dir # Optional
- logging.error("Check Hugging Face Hub status if issues persist.")
- # Optionally raise to stop the app, or try to proceed if partial files might exist
- logging.info(
- logging.error("Verify model files integrity in '{model_path}'.")
- def generate_response(prompt, history, max_length
- """Generates a response using the Phi-4 ONNX model."""
- # "<|user|>\n{user_message}<|end|>\n<|assistant|>\n{assistant_message}<|end|>"
- if assistant_msg:
- # Add the current user prompt and the trigger for the assistant's response
- logging.info(f"Generating response
- # logging.debug(f"Full Formatted Prompt:\n{full_prompt}")

@@ -111,9 +116,9 @@ def generate_response(prompt, history, max_length=1024, temperature=0.7, top_p=0
- "do_sample": True,
- "eos_token_id": tokenizer.eos_token_id,
- "pad_token_id": tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else tokenizer.eos_token_id,

@@ -123,83 +128,155 @@
(old lines 163-192, the previous Gradio interface block, were also removed)
- # debug=True can be useful for local testing but should generally be False in production/Spaces
- demo.queue()
- demo.launch(show_error=True) # Show errors in the UI for easier debugging in Spaces
The updated file (app.py, lines 13-282 of the new version; added lines are marked with "+"):

 MODEL_REPO = "microsoft/Phi-4-mini-instruct-onnx"

 # --- Defaulting to CPU INT4 for Hugging Face Spaces ---
 EXECUTION_PROVIDER = "cpu"
 MODEL_VARIANT_GLOB = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*"
 # Ensure requirements.txt lists: onnxruntime-genai
 # --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

+# --- (Optional) Alternative GPU Configuration ---
 # EXECUTION_PROVIDER = "cuda"
 # MODEL_VARIANT_GLOB = "gpu/gpu-int4-rtn-block-32/*"
 # Ensure requirements.txt lists: onnxruntime-genai-cuda
 # --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

 LOCAL_MODEL_DIR = "./phi4-mini-onnx-model" # Directory within the Space
+HF_LOGO_URL = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
+HF_MODEL_URL = f"https://huggingface.co/{MODEL_REPO}"
+ORT_GENAI_URL = "https://github.com/microsoft/onnxruntime-genai"
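For reference (not part of app.py): a minimal requirements.txt consistent with the CPU configuration above. Only onnxruntime-genai (or onnxruntime-genai-cuda for the GPU variant) is stated in the comments; adding gradio and huggingface_hub is an assumption based on the gr.* and snapshot_download calls the file uses, and no version pins are implied by this change.

# requirements.txt (sketch, CPU variant)
onnxruntime-genai
gradio
huggingface_hub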
 # Global variables for model and tokenizer
 model = None
 tokenizer = None
 model_variant_name = os.path.basename(os.path.dirname(MODEL_VARIANT_GLOB)) # For display
+model_status = "Initializing..."

 # --- Model Download and Load ---
 def initialize_model():
     """Downloads and loads the ONNX model and tokenizer."""
+    global model, tokenizer, model_status
     logging.info("--- Initializing ONNX Runtime GenAI ---")
+    model_status = "Downloading model..."
+    logging.info(model_status)

     # --- Download ---
     model_variant_dir = os.path.join(LOCAL_MODEL_DIR, os.path.dirname(MODEL_VARIANT_GLOB))
 ... (lines 48-50 unchanged, not shown)
     else:
         logging.info(f"Downloading model variant '{MODEL_VARIANT_GLOB}' from {MODEL_REPO}...")
         try:
             snapshot_download(
                 MODEL_REPO,
                 allow_patterns=[MODEL_VARIANT_GLOB],
                 local_dir=LOCAL_MODEL_DIR,
+                local_dir_use_symlinks=False
             )
             model_path = model_variant_dir
             logging.info(f"Model downloaded to: {model_path}")
         except Exception as e:
             logging.error(f"Error downloading model: {e}", exc_info=True)
+            model_status = f"Error downloading model: {e}"
             raise RuntimeError(f"Failed to download model: {e}")

     # --- Load ---
+    model_status = f"Loading model ({EXECUTION_PROVIDER.upper()})..."
+    logging.info(model_status)
     try:
+        # Determine device type based on execution provider string
+        if EXECUTION_PROVIDER.lower() == "cuda":
+            og_device_type = og.DeviceType.CUDA
+        elif EXECUTION_PROVIDER.lower() == "dml":
+            og_device_type = og.DeviceType.DML # Requires onnxruntime-genai-directml
+        else: # Default to CPU
+            og_device_type = og.DeviceType.CPU
+
         model = og.Model(model_path, og_device_type)
         tokenizer = og.Tokenizer(model)
+        model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
         logging.info("Model and Tokenizer loaded successfully.")
     except Exception as e:
         logging.error(f"Error loading model or tokenizer: {e}", exc_info=True)
+        model_status = f"Error loading model: {e}"
         raise RuntimeError(f"Failed to load model: {e}")
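For reference (not part of app.py): a standalone smoke-test sketch that mirrors the onnxruntime-genai calls used above (og.Model, og.Tokenizer, og.GeneratorParams, og.Generator) against the downloaded CPU variant directory. The params.set_search_options(...) and params.input_ids wiring follows common onnxruntime-genai examples and is an assumption about what the unchanged lines 113-115 and 125-127 of app.py do; treat it as a sketch, not the file's exact code.

# smoke_test.py (sketch; run after the model variant has been downloaded)
import onnxruntime_genai as og

model_dir = "./phi4-mini-onnx-model/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
model = og.Model(model_dir, og.DeviceType.CPU)   # same call shape as app.py
tokenizer = og.Tokenizer(model)

prompt = "<|user|>\nSay hello in one sentence.<|end|>\n<|assistant|>\n"
input_tokens = tokenizer.encode(prompt)

params = og.GeneratorParams(model)
params.set_search_options(max_length=128, temperature=0.7, top_p=0.9, top_k=50, do_sample=True)  # assumed API
params.input_ids = input_tokens  # assumed API

# Token-by-token loop, as in generate_response()
generator = og.Generator(model, params)
text = ""
while not generator.is_done():
    generator.compute_logits()
    generator.generate_next_token()
    text += tokenizer.decode([generator.get_next_tokens()[0]])
print(text)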
 # --- Generation Function ---
+def generate_response(prompt, history, max_length, temperature, top_p, top_k):
+    """Generates a response using the Phi-4 ONNX model, yielding partial results."""
+    global model_status
     if not model or not tokenizer:
+        model_status = "Error: Model not initialized!"
+        yield "Error: Model not initialized. Please check logs."
+        return
     if not prompt:
+        yield "Please enter a prompt."
+        return

     # --- Prepare the prompt using the Phi-4 instruct format ---
     full_prompt = ""
     for user_msg, assistant_msg in history:
         full_prompt += f"<|user|>\n{user_msg}<|end|>\n"
+        if assistant_msg:
             full_prompt += f"<|assistant|>\n{assistant_msg}<|end|>\n"
     full_prompt += f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

+    logging.info(f"Generating response (MaxL: {max_length}, Temp: {temperature}, TopP: {top_p}, TopK: {top_k})")
+    # logging.debug(f"Full Formatted Prompt:\n{full_prompt}")

     try:
         input_tokens = tokenizer.encode(full_prompt)
 ... (lines 113-115 unchanged, not shown)
             "temperature": temperature,
             "top_p": top_p,
             "top_k": top_k,
+            "do_sample": True,
+            "eos_token_id": tokenizer.eos_token_id,
+            "pad_token_id": tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else tokenizer.eos_token_id,
         }

         params = og.GeneratorParams(model)
 ... (lines 125-127 unchanged, not shown)
         start_time = time.time()
         generator = og.Generator(model, params)
         response_text = ""
+        model_status = "Generating..." # Update status indicator
         logging.info("Streaming response...")

+        first_token_time = None
         while not generator.is_done():
             generator.compute_logits()
             generator.generate_next_token()
+            if first_token_time is None:
+                first_token_time = time.time() # Record time to first token
             next_token = generator.get_next_tokens()[0]
+
             if next_token == search_options["eos_token_id"]:
+                logging.info("EOS token encountered.")
                 break
+
             decoded_chunk = tokenizer.decode([next_token])
+
+            # Handle potential decoding issues or special tokens if necessary
+            # (e.g., some models might output "<|end|>" which you might want to strip)
+            if decoded_chunk == "<|end|>": # Example: Stop if assistant outputs end token explicitly
+                logging.info("Assistant explicitly generated <|end|> token.")
+                break
+
             response_text += decoded_chunk
             yield response_text # Yield intermediate results for streaming effect

         end_time = time.time()
+        ttft = (first_token_time - start_time) * 1000 if first_token_time else -1
+        total_time = end_time - start_time
+        token_count = len(tokenizer.decode(generator.get_output_sequences()[0])) # Approx token count
+        tps = (token_count / total_time) if total_time > 0 else 0

+        logging.info(f"Generation complete. Tokens: ~{token_count}, Total Time: {total_time:.2f}s, TTFT: {ttft:.2f}ms, TPS: {tps:.2f}")
+        model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})" # Reset status
+
+        # Final yield with the complete text
         yield response_text.strip()

     except Exception as e:
         logging.error(f"Error during generation: {e}", exc_info=True)
+        model_status = f"Error during generation: {e}"
+        yield f"Sorry, an error occurred during generation: {e}"

+# --- Clear Chat Function ---
+def clear_chat():
+    return None, None # Clears Textbox and Chatbot
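For reference (not part of app.py): with the formatting loop above, a one-turn history plus a new prompt is flattened into a single Phi-4 instruct string, and because generate_response is a generator it can also be consumed outside Gradio. The history and prompt strings below are made up for illustration.

# What full_prompt looks like for history=[("Hi", "Hello!")] and prompt="Tell me a joke":
# "<|user|>\nHi<|end|>\n<|assistant|>\nHello!<|end|>\n<|user|>\nTell me a joke<|end|>\n<|assistant|>\n"

# Consuming the streaming generator directly (prints progressively longer partial responses):
for partial in generate_response("Tell me a joke", [("Hi", "Hello!")], 256, 0.7, 0.9, 50):
    print(partial)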
 # --- Initialize Model on App Start ---
+# Wrap in try-except to allow Gradio UI to potentially load even if model fails
 try:
     initialize_model()
 except Exception as e:
     print(f"FATAL: Model initialization failed: {e}")
+    model_status = f"FATAL ERROR during init: {e}"
+    # The UI will still load, but generation will fail. The status will show the error.

 # --- Gradio Interface ---
 logging.info("Creating Gradio Interface...")

+# Select a theme
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="sky",
+    neutral_hue="slate",
+    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
+).set(
+    # Customize specific component styles if needed
+    # button_primary_background_fill="*primary_500",
+    # button_primary_background_fill_hover="*primary_400",
+)

+with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
+    # Header Section
+    with gr.Row(equal_height=False):
+        with gr.Column(scale=3):
+            gr.Markdown(f"""
+            # Phi-4 Mini Instruct ONNX Chat 🤖
+            Interact with the quantized `{model_variant_name}` version of [`{MODEL_REPO}`]({HF_MODEL_URL})
+            running efficiently via [`onnxruntime-genai`]({ORT_GENAI_URL}).
+            """)
+        with gr.Column(scale=1, min_width=150):
+            gr.Image(HF_LOGO_URL, elem_id="hf-logo", show_label=False, show_download_button=False, container=False, height=50)
+            model_status_text = gr.Textbox(value=model_status, label="Model Status", interactive=False, max_lines=2)

+    # Main Layout (Chat on Left, Settings on Right)
+    with gr.Row():
+        # Chat Column
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot(
+                label="Conversation",
+                height=600,
+                layout="bubble",
+                bubble_full_width=False,
+                avatar_images=(None, "https://microsoft.github.io/phi/assets/img/logo-final.png") # (user, bot) - Optional: Add user avatar path/URL if desired
+            )
+            with gr.Row():
+                prompt_input = gr.Textbox(
+                    label="Your Message",
+                    placeholder="<|user|>\nType your message here...\n<|end|>",
+                    lines=4,
+                    scale=9 # Make textbox wider
+                )
+                submit_button = gr.Button("Send", variant="primary", scale=1, min_width=120) # Primary send button
+                clear_button = gr.Button("🗑️ Clear", variant="secondary", scale=1, min_width=120) # Secondary clear button

+        # Settings Column
+        with gr.Column(scale=1, min_width=250):
+            gr.Markdown("### ⚙️ Generation Settings")
+            with gr.Group(): # Group settings visually
+                max_length = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max Length", info="Max tokens in response.")
+                temperature = gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature", info="0.0 = deterministic\n>1.0 = more random")
+                top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-P", info="Nucleus sampling probability.")
+                top_k = gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top-K", info="Limit to K most likely tokens (0=disable).")

+            gr.Markdown("---") # Separator
+            gr.Markdown("ℹ️ **Note:** Uses Phi-4 instruction format: \n`<|user|>\nPROMPT<|end|>\n<|assistant|>`")

+    # Event Listeners (Connecting UI components to functions)

+    # Define reusable inputs list for generation
+    gen_inputs = [prompt_input, chatbot, max_length, temperature, top_p, top_k]

+    # Submit action (using streaming yields from generate_response)
+    submit_button.click(
+        fn=generate_response,
+        inputs=gen_inputs,
+        outputs=[chatbot], # Output directly streams to chatbot
+        queue=True # Enable queuing
+    )
+    # Allow submitting via Enter key in the textbox as well
+    prompt_input.submit(
+        fn=generate_response,
+        inputs=gen_inputs,
+        outputs=[chatbot],
+        queue=True
     )

+    # Clear button action
+    clear_button.click(
+        fn=clear_chat,
+        inputs=None,
+        outputs=[prompt_input, chatbot], # Clear both input and chat history
+        queue=False # No need to queue clearing
+    )

+# Launch the Gradio app
 logging.info("Launching Gradio App...")
+demo.queue() # Enable queuing for handling concurrent users/requests
+demo.launch(show_error=True, max_threads=40) # show_error=True helps debug in Spaces
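For reference (not part of app.py): a local-testing variant of the launch call. server_name, server_port, share, and show_error are standard gradio demo.launch parameters; the specific values are assumptions for running outside a Space, not something specified by this change.

# Local run sketch (instead of the Spaces launch above)
demo.queue()
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)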
|