kimhyunwoo committed (verified)
Commit c5ec987 · 1 Parent(s): a41650d

Update app.py

Files changed (1):
  1. app.py +58 -76
app.py CHANGED
@@ -4,133 +4,115 @@ import torch
 import os
 import gradio as gr

-# --- 1. Authentication (Using Environment Variable - the ONLY correct way for Spaces) ---

-# Hugging Face Spaces CANNOT use interactive login. You MUST use an environment variable.
-# 1. Go to your Space's settings.
-# 2. Click on "Repository Secrets".
-# 3. Click "New Secret".
-# 4. Name the secret: HUGGING_FACE_HUB_TOKEN
-# 5. Paste your Hugging Face API token (with read access) as the value.
-# 6. Save the secret.
-
-# The login() call below will now automatically use the environment variable.
-login()

-# --- 2. Model and Tokenizer Setup (with comprehensive error handling) ---

 def load_model_and_tokenizer(model_name="google/gemma-3-1b-it"):
-    """Loads the model and tokenizer, handling potential errors."""
     try:
-        # Suppress unnecessary warning messages from transformers
         logging.set_verbosity_error()
-
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            device_map="auto",  # Automatically use GPU if available, else CPU
-            torch_dtype=torch.bfloat16,  # Use bfloat16 for speed/memory if supported
-            attn_implementation="flash_attention_2"  # Use Flash Attention 2 if supported
         )
         return model, tokenizer
-
     except Exception as e:
-        print(f"ERROR: Failed to load model or tokenizer: {e}")
-        print("\nTroubleshooting Steps:")
-        print("1. Ensure you have a Hugging Face account and have accepted the model's terms.")
-        print("2. Verify your internet connection.")
-        print("3. Double-check the model name: 'google/gemma-3-1b-it'")
-        print("4. Ensure you are properly authenticated using a Repository Secret (see above).")
-        print("5. If using a GPU, ensure your CUDA drivers and PyTorch are correctly installed.")
-        # Instead of exiting, raise the exception to be caught by Gradio
-        raise
-
-model, tokenizer = load_model_and_tokenizer()
-

-# --- 3. Chat Template Function (CRITICAL for conversational models) ---

 def apply_chat_template(messages, tokenizer):
-    """Applies the appropriate chat template."""
     try:
         if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
             return tokenizer.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )
         else:
-            print("WARNING: Tokenizer does not have a defined chat_template. Using a fallback.")
             chat_template = "{% for message in messages %}" \
                             "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] + '<end_of_turn>\n' }}" \
                             "{% endfor %}" \
                             "{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}"
             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, chat_template=chat_template)
-
     except Exception as e:
-        print(f"ERROR: Failed to apply chat template: {e}")
-        raise  # Re-raise to be caught by Gradio
-

 # --- 4. Text Generation Function ---

 def generate_response(messages, model, tokenizer, max_new_tokens=256, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=1.2):
     """Generates a response."""
     prompt = apply_chat_template(messages, tokenizer)
-
     try:
         pipeline_instance = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
             model_kwargs={"attn_implementation": "flash_attention_2"}
-        )
-
         outputs = pipeline_instance(
-            prompt,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=temperature,
-            top_k=top_k,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            pad_token_id=tokenizer.eos_token_id,
         )

-        generated_text = outputs[0]["generated_text"][len(prompt):].strip()
-        return generated_text

-    except Exception as e:
-        print(f"ERROR: Failed to generate response: {e}")
-        raise  # Re-raise the exception


-# --- 5. Gradio Interface ---

-def predict(message, history):
     if not history:
         history = []
-    messages = []
-    for user_msg, bot_response in history:
-        messages.append({"role": "user", "content": user_msg})
-        if bot_response:  # Check if bot_response is not None
-            messages.append({"role": "model", "content": bot_response})
     messages.append({"role": "user", "content": message})

     try:
-        response = generate_response(messages, model, tokenizer)
-        history.append((message, response))
-        return "", history
     except Exception as e:
-        # Catch any exceptions during generation and display in the UI
-        return f"Error: {e}", history
-

 with gr.Blocks() as demo:
-    chatbot = gr.Chatbot(label="Gemma Chatbot", height=500)
-    msg = gr.Textbox(placeholder="Ask me anything!", container=False, scale=7)
-    clear = gr.ClearButton([msg, chatbot])

-    msg.submit(predict, [msg, chatbot], [msg, chatbot])

 demo.launch()
 
 import os
 import gradio as gr

+# --- 1. Authentication (Using User-Provided Token) ---

+def authenticate(token):
+    """Attempts to authenticate with the provided token."""
+    try:
+        login(token=token)
+        return True
+    except Exception as e:
+        print(f"Authentication failed: {e}")
+        return False

+# --- 2. Model and Tokenizer Setup ---

 def load_model_and_tokenizer(model_name="google/gemma-3-1b-it"):
+    """Loads the model and tokenizer."""
     try:
         logging.set_verbosity_error()
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2"
         )
         return model, tokenizer
     except Exception as e:
+        print(f"ERROR: Failed to load model/tokenizer: {e}")
+        raise  # Re-raise for Gradio

+# --- 3. Chat Template Function ---

 def apply_chat_template(messages, tokenizer):
+    """Applies the chat template."""
     try:
         if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
             return tokenizer.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )
         else:
+            print("WARNING: Tokenizer lacks chat_template. Using fallback.")
             chat_template = "{% for message in messages %}" \
                             "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] + '<end_of_turn>\n' }}" \
                             "{% endfor %}" \
                             "{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}"
             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, chat_template=chat_template)
     except Exception as e:
+        print(f"ERROR: Chat template application failed: {e}")
+        raise

 # --- 4. Text Generation Function ---

 def generate_response(messages, model, tokenizer, max_new_tokens=256, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=1.2):
     """Generates a response."""
     prompt = apply_chat_template(messages, tokenizer)
     try:
         pipeline_instance = pipeline(
+            "text-generation", model=model, tokenizer=tokenizer,
+            torch_dtype=torch.bfloat16, device_map="auto",
             model_kwargs={"attn_implementation": "flash_attention_2"}
+        )
         outputs = pipeline_instance(
+            prompt, max_new_tokens=max_new_tokens, do_sample=True,
+            temperature=temperature, top_k=top_k, top_p=top_p,
+            repetition_penalty=repetition_penalty, pad_token_id=tokenizer.eos_token_id
         )
+        return outputs[0]["generated_text"][len(prompt):].strip()
+    except Exception as e:
+        print(f"ERROR: Response generation failed: {e}")
+        raise

+# --- 5. Gradio Interface ---
+model = None  # Initialize model and tokenizer as global variables
+tokenizer = None

+def chat(token, message, history):
+    global model, tokenizer  # Access the global model and tokenizer

+    if not authenticate(token):
+        return "Authentication failed. Please enter a valid Hugging Face token.", history

+    if model is None or tokenizer is None:
+        try:
+            model, tokenizer = load_model_and_tokenizer()
+        except Exception as e:
+            return f"Model loading error: {e}", history

     if not history:
         history = []
+    messages = [{"role": "user", "content": msg} for msg, _ in history]
+    messages.extend([{"role": "model", "content": resp} for _, resp in history if resp])
     messages.append({"role": "user", "content": message})

     try:
+        response = generate_response(messages, model, tokenizer)
+        history.append((message, response))
+        return "", history
     except Exception as e:
+        return f"Error during generation: {e}", history

 with gr.Blocks() as demo:
+    gr.Markdown("# Gemma Chatbot")
+    gr.Markdown("Enter your Hugging Face API token (read access required):")
+    token_input = gr.Textbox(label="Hugging Face Token", type="password")  # Use type="password"
+    chatbot = gr.Chatbot(label="Chat", height=400)
+    msg_input = gr.Textbox(label="Message", placeholder="Ask me anything!")
+    clear_btn = gr.ClearButton([msg_input, chatbot])
+
+    msg_input.submit(chat, [token_input, msg_input, chatbot], [msg_input, chatbot])
+    clear_btn.click(lambda: (None, []), [], [msg_input, chatbot])


 demo.launch()
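
The commit replaces the Space-secret login with a per-user token typed into the UI. For Spaces that still define the HUGGING_FACE_HUB_TOKEN repository secret described in the removed comments, a small helper could accept either source. The sketch below is not part of the commit: the name resolve_token and the fall-back-to-environment behaviour are assumptions layered on the login(token=...) call that the new authenticate() already uses.

import os
from huggingface_hub import login

def resolve_token(user_token):
    """Hypothetical helper: prefer the token pasted into the textbox,
    otherwise fall back to the HUGGING_FACE_HUB_TOKEN repository secret
    (the mechanism the previous version of app.py relied on)."""
    token = (user_token or "").strip() or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    if not token:
        return None
    try:
        login(token=token)  # same call the new authenticate() makes
        return token
    except Exception as e:
        print(f"Authentication failed: {e}")
        return None

With something like this, chat() could treat an empty textbox as "use the Space secret" instead of failing outright, leaving the rest of the handler unchanged.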