Update app.py
app.py
CHANGED
@@ -1,20 +1,205 @@
import gradio as gr
import onnxruntime_genai as og
import time
import os
from huggingface_hub import snapshot_download
import argparse
import logging

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Configuration ---
MODEL_REPO = "microsoft/Phi-4-mini-instruct-onnx"

# --- Defaulting to CPU INT4 for Hugging Face Spaces ---
# Free Spaces generally provide CPU resources.
# If deploying on a paid GPU Space, you would change these
# and the requirements.txt accordingly.
EXECUTION_PROVIDER = "cpu"
MODEL_VARIANT_GLOB = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*"
# Ensure requirements.txt lists: onnxruntime-genai
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

# --- Alternative GPU Configuration (Requires GPU Space & requirements change) ---
# EXECUTION_PROVIDER = "cuda"
# MODEL_VARIANT_GLOB = "gpu/gpu-int4-rtn-block-32/*"
# Ensure requirements.txt lists: onnxruntime-genai-cuda
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
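# For reference, a minimal sketch of requirements.txt for this configuration
# (package names assumed from the comments above; pin versions as needed):
#
#   gradio
#   huggingface_hub
#   onnxruntime-genai        # CPU default; swap for onnxruntime-genai-cuda on a GPU Space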

LOCAL_MODEL_DIR = "./phi4-mini-onnx-model"  # Directory within the Space
HF_LOGO_URL = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg"  # Official HF Logo

# Global variables for model and tokenizer
model = None
tokenizer = None
model_variant_name = os.path.basename(os.path.dirname(MODEL_VARIANT_GLOB))  # For display

# --- Model Download and Load ---
def initialize_model():
    """Downloads and loads the ONNX model and tokenizer."""
    global model, tokenizer
    logging.info("--- Initializing ONNX Runtime GenAI ---")

    # --- Download ---
    model_variant_dir = os.path.join(LOCAL_MODEL_DIR, os.path.dirname(MODEL_VARIANT_GLOB))
    if os.path.exists(model_variant_dir) and os.listdir(model_variant_dir):
        logging.info(f"Model variant found in {model_variant_dir}. Skipping download.")
        model_path = model_variant_dir
    else:
        logging.info(f"Downloading model variant '{MODEL_VARIANT_GLOB}' from {MODEL_REPO}...")
        try:
            # Use cache_dir for potentially faster re-runs if Space storage allows caching
            # cache_dir = os.path.join(LOCAL_MODEL_DIR, ".cache")  # Optional: define cache dir
            snapshot_download(
                MODEL_REPO,
                allow_patterns=[MODEL_VARIANT_GLOB],
                local_dir=LOCAL_MODEL_DIR,
                local_dir_use_symlinks=False  # Safest for cross-platform/Space compatibility
                # cache_dir=cache_dir  # Optional
            )
            model_path = model_variant_dir
            logging.info(f"Model downloaded to: {model_path}")
        except Exception as e:
            logging.error(f"Error downloading model: {e}", exc_info=True)
            logging.error("Please ensure the Space has internet access and necessary permissions.")
            logging.error("Check Hugging Face Hub status if issues persist.")
            # Optionally raise to stop the app, or try to proceed if partial files might exist
            raise RuntimeError(f"Failed to download model: {e}")

    # --- Load ---
    logging.info(f"Loading model from: {model_path}")
    logging.info(f"Using Execution Provider: {EXECUTION_PROVIDER.upper()}")
    try:
        # The execution provider is determined by which onnxruntime-genai package
        # is installed (onnxruntime-genai for CPU, onnxruntime-genai-cuda for CUDA);
        # og.Model() only needs the path to the model folder.
        model = og.Model(model_path)
        tokenizer = og.Tokenizer(model)
        logging.info("Model and Tokenizer loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading model or tokenizer: {e}", exc_info=True)
        logging.error(f"Ensure the correct onnxruntime-genai package is installed (check requirements.txt) for {EXECUTION_PROVIDER}.")
        logging.error(f"Verify model files integrity in '{model_path}'.")
        raise RuntimeError(f"Failed to load model: {e}")
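
# For reference (assumed layout): after a successful download, model_path should
# contain the ONNX variant files that og.Model() / og.Tokenizer() read directly,
# e.g. genai_config.json, model.onnx, model.onnx.data and the tokenizer files.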

# --- Generation Function ---
def generate_response(prompt, history, max_length=1024, temperature=0.7, top_p=0.9, top_k=50):
    """Generates a streaming response using the Phi-4 ONNX model."""
    if not model or not tokenizer:
        # This function is a generator, so yield (rather than return) the message
        # to ensure Gradio displays it in the chat window.
        yield history + [[prompt, "Error: Model not initialized. Please check logs."]]
        return
    if not prompt:
        yield history + [[prompt, "Please enter a prompt."]]
        return

    # --- Prepare the prompt using the Phi-4 instruct format ---
    # "<|user|>\n{user_message}<|end|>\n<|assistant|>\n{assistant_message}<|end|>"
    full_prompt = ""
    for user_msg, assistant_msg in history:
        full_prompt += f"<|user|>\n{user_msg}<|end|>\n"
        if assistant_msg:  # Add assistant message only if it exists
            full_prompt += f"<|assistant|>\n{assistant_msg}<|end|>\n"

    # Add the current user prompt and the trigger for the assistant's response
    full_prompt += f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
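    # For illustration (assumed single-turn example): with history=[] and
    # prompt="Hello", full_prompt at this point is:
    #   "<|user|>\nHello<|end|>\n<|assistant|>\n"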

    logging.info(f"Generating response for prompt (last part): ...{prompt[-50:]}")
    # logging.debug(f"Full Formatted Prompt:\n{full_prompt}")  # Use debug level

    try:
        input_tokens = tokenizer.encode(full_prompt)

        search_options = {
            "max_length": max_length,
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "do_sample": True,  # Sampling is generally preferred for chat
            "eos_token_id": tokenizer.eos_token_id,  # Important for stopping generation
            "pad_token_id": tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else tokenizer.eos_token_id,  # Use EOS if PAD not explicit
        }

        params = og.GeneratorParams(model)
        params.set_search_options(**search_options)
        params.input_ids = input_tokens

        start_time = time.time()
        generator = og.Generator(model, params)
        response_text = ""
        logging.info("Streaming response...")

        # Simple token streaming - yield the growing chat history so Gradio updates the Chatbot gradually
        while not generator.is_done():
            generator.compute_logits()
            generator.generate_next_token()
            next_token = generator.get_next_tokens()[0]
            # Important: check for the EOS token ID to stop manually if needed
            if next_token == search_options["eos_token_id"]:
                break
            decoded_chunk = tokenizer.decode([next_token])
            response_text += decoded_chunk
            yield history + [[prompt, response_text]]  # Yield intermediate results for streaming effect

        end_time = time.time()
        logging.info(f"Generation complete. Time taken: {end_time - start_time:.2f} seconds")
        logging.info(f"Full Response (last 100 chars): ...{response_text[-100:]}")

        # Final yield with the complete text appended to the chat history
        yield history + [[prompt, response_text.strip()]]

    except Exception as e:
        logging.error(f"Error during generation: {e}", exc_info=True)
        yield history + [[prompt, f"Sorry, an error occurred during generation: {e}"]]  # Surface the error in the chat
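
# Illustrative usage (assumption, not part of the app's normal flow): once
# initialize_model() has run, the generator can be smoke-tested from a Python
# shell like this:
#
#   for partial_history in generate_response("Say hello in one sentence.", []):
#       pass
#   print(partial_history[-1][1])  # final assistant reply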

# --- Initialize Model on App Start ---
try:
    initialize_model()
except Exception as e:
    print(f"FATAL: Model initialization failed: {e}")
    # Optionally create a dummy Gradio interface showing the error
    # Or just let the script exit/fail in the Space environment

# --- Gradio Interface ---
logging.info("Creating Gradio Interface...")

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"""
    # Phi-4 Mini Instruct ONNX Demo
    Powered by [`onnxruntime-genai`](https://github.com/microsoft/onnxruntime-genai) running the **{EXECUTION_PROVIDER.upper()}** INT4 ONNX Runtime version of [`{MODEL_REPO}`](https://huggingface.co/{MODEL_REPO}).
    Model Variant: `{model_variant_name}`

    <img src="{HF_LOGO_URL}" alt="Hugging Face Logo" style="display: inline-block; height: 1.5em; vertical-align: middle;"> This Space demonstrates running Phi-4 Mini efficiently with ONNX Runtime.
    """)

    chatbot = gr.Chatbot(label="Chat History", height=500, layout="bubble", bubble_full_width=False)
    msg = gr.Textbox(
        label="Your Prompt",
        placeholder="Type your message here...",
        lines=3,
        info="The Phi-4 instruct format (<|user|>...<|end|><|assistant|>) is applied automatically."
    )
    clear = gr.Button("Clear Chat")

    with gr.Accordion("Generation Parameters", open=False):
        max_length = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max Length", info="Maximum number of tokens to generate.")
        temperature = gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature", info="Higher values (e.g., 0.8) make output more random; lower values (e.g., 0.2) make it more deterministic.")
        top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-P (Nucleus Sampling)", info="Filters vocabulary to the smallest set whose cumulative probability exceeds P. Set to 1.0 to disable.")
        top_k = gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top-K", info="Filters vocabulary to the K most likely tokens. Set to 0 to disable.")

    # Use the generator function for streaming:
    # msg.submit returns a generator, which updates the chatbot gradually
    msg.submit(
        generate_response,
        inputs=[msg, chatbot, max_length, temperature, top_p, top_k],
        outputs=[chatbot]
    )

    # Connect the clear button
    clear.click(lambda: (None, None), None, [msg, chatbot], queue=False)  # Clear input and chatbot

    gr.Markdown("Enter your prompt and press Enter. Adjust generation parameters in the accordion above.")

logging.info("Launching Gradio App...")
# Setting share=False is the default and recommended for Spaces
# queue() is important for handling multiple users
# debug=True can be useful for local testing but should generally be False in production/Spaces
demo.queue()
demo.launch(show_error=True)  # Show errors in the UI for easier debugging in Spaces
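
# To run this app locally (assumed workflow): install the dependencies, e.g.
#   pip install gradio huggingface_hub onnxruntime-genai
# then start the server with:
#   python app.py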