Spaces:

kimhyunwoo
/

freetestn

Sleeping

App Files Files Community

kimhyunwoo committed 10 days ago

Commit

61bf4d9

verified ·

1 Parent(s): 6d73a79

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -74

app.py CHANGED Viewed

@@ -1,137 +1,209 @@
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import gc # Garbage collector
 # --- Configuration ---
 MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
-MAX_NEW_TOKENS = 512 # Limit output length for faster response on CPU
-SYSTEM_PROMPT = "- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n- 사용자의 질문에 대해 친절하고 자세하게 답변해야 한다." # Simplified system prompt
 # --- Model Loading ---
-# Load model and tokenizer explicitly on CPU
-# Using device_map='cpu' and torch_dtype=torch.float32 ensures it targets the CPU
-print(f"Loading model: {MODEL_ID}...")
 try:
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype=torch.float32, # Use float32 for CPU compatibility
-        device_map="cpu" # Explicitly load on CPU
     )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    print("Model and tokenizer loaded successfully on CPU.")
-    # Ensure model is in evaluation mode
-    model.eval()
 except Exception as e:
-    print(f"Error loading model: {e}")
-    # If loading fails, exit or handle gracefully
-    raise gr.Error(f"Failed to load the model {MODEL_ID}. Check logs. Error: {e}")
-# Define stop tokens based on the example, get their IDs
-stop_token_strings = ["<|endofturn|>", "<|stop|>"]
-stop_token_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]
-# Also include the standard EOS token if it's different
-if tokenizer.eos_token_id not in stop_token_ids:
-    stop_token_ids.append(tokenizer.eos_token_id)
 # --- Inference Function ---
 def predict(message, history):
     """
     Generates a response using the HyperCLOVAX model based on user message and chat history.
     """
-    # Reconstruct chat history in the required format
     chat_history_formatted = [
-        {"role": "tool_list", "content": ""}, # As per model card example structure
-        {"role": "system", "content": SYSTEM_PROMPT}
     ]
     for user_msg, ai_msg in history:
         chat_history_formatted.append({"role": "user", "content": user_msg})
-        # Add the endofturn token manually if needed, or rely on template
-        chat_history_formatted.append({"role": "assistant", "content": ai_msg }) # Ensure assistant response is captured correctly
-    # Add the current user message
     chat_history_formatted.append({"role": "user", "content": message})
-    # Apply the chat template
     try:
         inputs = tokenizer.apply_chat_template(
             chat_history_formatted,
-            add_generation_prompt=True, # Crucial for instruct models
             return_dict=True,
             return_tensors="pt"
-        ).to(model.device) # Ensure inputs are on the CPU device
-    except Exception as e:
-        print(f"Error applying chat template: {e}")
-        return f"Error formatting input: {e}"
-    print(f"Input tokens: {inputs['input_ids'].shape[1]}") # Log input length
-    # Generate response - Use torch.no_grad() to save memory
     try:
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
-                eos_token_id=stop_token_ids, # Use the identified stop token IDs
-                pad_token_id=tokenizer.eos_token_id, # Often set pad = eos
-                do_sample=True, # Sample for more diverse outputs
-                temperature=0.7,
-                top_p=0.9,
-                # Note: The original example used 'stop_strings' and passed the tokenizer,
-                # which isn't standard in `generate`. Using eos_token_id is preferred.
-                # If specific stopping behavior is needed beyond EOS, StoppingCriteria can be used.
             )
     except Exception as e:
-        print(f"Error during model generation: {e}")
-        # Clean up GPU memory if an error occurs (though we are on CPU, good practice)
         gc.collect()
-        # Consider torch.cuda.empty_cache() if GPU was accidentally involved
-        return f"Error during generation: {e}"
-    # Decode only the newly generated tokens
-    input_length = inputs['input_ids'].shape[1]
     new_tokens = output_ids[0, input_length:]
     response = tokenizer.decode(new_tokens, skip_special_tokens=True)
-    print(f"Output tokens: {len(new_tokens)}") # Log output length
-    print(f"Raw response: {response}")
-    # Clean up memory (especially important in constrained environments)
     del inputs
     del output_ids
-    gc.collect()
-    # Consider torch.cuda.empty_cache() if GPU was involved
     return response
 # --- Gradio Interface ---
-# Use ChatInterface for a conversational experience
-chatbot = gr.Chatbot(label="HyperCLOVA X SEED (0.5B)", height=600)
 demo = gr.ChatInterface(
-    fn=predict,
-    chatbot=chatbot,
-    title="🇰🇷 HyperCLOVA X SEED (0.5B) - CPU Demo",
     description=(
-        f"Chat with the `naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B` model. "
-        f"This model excels in Korean language understanding. \n"
-        f"**Note:** Running on a free CPU Hugging Face Space. Responses will be **slow**. "
-        f"Max new tokens set to {MAX_NEW_TOKENS}."
     ),
-    examples=[
-        ["슈뢰딩거 방정식과 양자역학의 관계를 최대한 자세히 알려줘."],
-        ["네이버 하이퍼클로바X에 대해 설명해줘."],
-        ["오늘 날씨 어때?"], # Example showing it might not know real-time data
-        ["간단한 파이썬 코드 예제를 보여줘."],
-    ],
-    cache_examples=False, # Caching might consume too much memory/disk on free tier
-    theme="soft",
-    retry_btn=None,
-    undo_btn="Delete Previous Turn",
-    clear_btn="Clear Conversation",
 )
 # --- Launch the App ---
 if __name__ == "__main__":
-    # queue() is important for handling multiple users, especially with slow inference
-    demo.queue().launch()

 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import gc
+import os
+import datetime
 # --- Configuration ---
 MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
+MAX_NEW_TOKENS = 512 # Limit output length for faster response on CPU (adjust as needed)
+CPU_THREAD_COUNT = 4 # Thread cap for torch on CPU; has no effect unless the commented-out lines below are enabled
+# Optional: uncomment to apply CPU_THREAD_COUNT (might help prevent resource exhaustion)
+# torch.set_num_threads(CPU_THREAD_COUNT)
+# os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
+# os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
+print("--- Environment Setup ---")
+print(f"PyTorch version: {torch.__version__}")
+print(f"Running on device: cpu") # Hard-coded label: the model is loaded with device_map="cpu"
+print(f"Torch Threads: {torch.get_num_threads()}") # Report the thread count torch is currently using
 # --- Model Loading ---
+print(f"--- Loading Model: {MODEL_ID} ---")
+print("This might take a few minutes, especially on the first launch...")
 try:
+    # Load model explicitly onto CPU with float32 (standard for CPU compatibility)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype=torch.float32, # Use float32 for CPU compatibility
+        device_map="cpu"          # Explicitly map to CPU
+        # low_cpu_mem_usage=True # Can sometimes help on low RAM, but might slow down loading
     )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    model.eval() # Set model to evaluation mode
+    print("--- Model and Tokenizer Loaded Successfully on CPU ---")
+    # --- Stop Token Configuration ---
+    # Get IDs for specified stop tokens and the standard EOS token
+    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
+    stop_token_ids_list = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]
+    # Ensure the official EOS token is also included if not already present
+    if tokenizer.eos_token_id not in stop_token_ids_list:
+        stop_token_ids_list.append(tokenizer.eos_token_id)
+    # Remove None values if any token wasn't found (though they should be in this vocab)
+    stop_token_ids_list = [tid for tid in stop_token_ids_list if tid is not None]
+    if not stop_token_ids_list:
+        print("Warning: Could not find any stop token IDs. Using default EOS only.")
+        stop_token_ids_list = [tokenizer.eos_token_id]
+    print(f"Using Stop Token IDs: {stop_token_ids_list}")
 except Exception as e:
+    print(f"!!! Error loading model: {e}")
+    # Clean up memory if partial loading occurred
+    del model
+    del tokenizer
+    gc.collect()
+    # Raise a Gradio error to make it visible in the UI
+    raise gr.Error(f"Failed to load the model {MODEL_ID}. Please check the Space logs. Error: {e}")
+# --- System Prompt ---
+# Use a dynamic date and the correct model name as per the card example structure
+def get_system_prompt():
+    # current_date = datetime.datetime.now().strftime("%Y년 %m월 %d일(%a)") # Korean date format
+    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)") # English date format is safer for consistency
+    return (
+        f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
+        # f"- 오늘은 {current_date}이다.\n" # Dynamic date can be added if desired
+        f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
+    )
 # --- Inference Function ---
 def predict(message, history):
     """
     Generates a response using the HyperCLOVAX model based on user message and chat history.
+    Handles chat formatting, generation, decoding, and memory management.
+
+    Args:
+        message: The latest user message; appended as the final "user" turn.
+        history: Previous turns, iterated as (user_msg, ai_msg) pairs.
+
+    Returns:
+        The decoded text of the newly generated tokens, or a Korean error
+        string ("오류: ...") if applying the chat template or generation raises.
     """
+    system_prompt = get_system_prompt()
+    # 1. Build the message list: tool_list and system turns first, then the prior turns
     chat_history_formatted = [
+        {"role": "tool_list", "content": ""}, # Required by the model card example
+        {"role": "system", "content": system_prompt}
     ]
     for user_msg, ai_msg in history:
         chat_history_formatted.append({"role": "user", "content": user_msg})
+        # The assistant text is passed as-is; no end-of-turn marker is added here
+        # (presumably apply_chat_template inserts it - verify against the template).
+        chat_history_formatted.append({"role": "assistant", "content": ai_msg}) # Append the actual AI response
+    # Add the latest user message
     chat_history_formatted.append({"role": "user", "content": message})
+    # 2. Apply the chat template
     try:
         inputs = tokenizer.apply_chat_template(
             chat_history_formatted,
+            add_generation_prompt=True, # Crucial for instruction-following models
             return_dict=True,
             return_tensors="pt"
+        ).to(model.device) # Move the encoded inputs to the device the model is on
+        input_length = inputs['input_ids'].shape[1] # Prompt length in tokens; used below to slice off the prompt
+        print(f"\nInput tokens: {input_length}")
+        # print(f"Formatted input text (approx): {tokenizer.decode(inputs['input_ids'][0])}") # For debugging
+    except Exception as e:
+        print(f"!!! Error applying chat template: {e}")
+        # Return the error as the reply text so the user sees it in the chat
+        return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"
+    # 3. Generate response using the model
+    output_ids = None # Pre-bind so the except-branch cleanup below can test it safely
     try:
+        print("Generating response...")
+        # Use torch.no_grad() to reduce memory footprint during inference
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
+                eos_token_id=stop_token_ids_list, # Use the list of stop token IDs
+                pad_token_id=tokenizer.eos_token_id, # Set pad token ID to EOS token ID
+                do_sample=True,    # Enable sampling for less repetitive output
+                temperature=0.7,   # Control randomness (lower = more focused)
+                top_p=0.9,         # Use nucleus sampling
+                # num_beams=1,     # Use 1 for sampling (greedy is default if do_sample=False)
+                # early_stopping=True # Stop generation early if EOS is reached
             )
+        print("Generation complete.")
     except Exception as e:
+        print(f"!!! Error during model generation: {e}")
+        # Clean up potentially large tensors in case of error
+        del inputs
+        if output_ids is not None: del output_ids
         gc.collect()
+        return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"
+    # 4. Decode the response
+    # Take row 0 of the output and skip the first input_length positions (the prompt)
     new_tokens = output_ids[0, input_length:]
     response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    print(f"Output tokens: {len(new_tokens)}")
+    # print(f"Raw response: '{response}'") # Log the raw decoded output
+    # 5. Clean up memory
     del inputs
     del output_ids
+    del new_tokens
+    gc.collect() # Explicitly run garbage collection
+    print("Memory cleaned.")
     return response
 # --- Gradio Interface ---
+print("--- Setting up Gradio Interface ---")
+# Chat display component handed to ChatInterface below
+chatbot_component = gr.Chatbot(
+    label="HyperCLOVA X SEED (0.5B) 대화",
+    bubble_full_width=False,
+    height=600
+    )
+# Example prompts (Korean), passed to ChatInterface via `examples`
+examples = [
+    ["네이버 클로바X는 무엇인가요?"],
+    ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
+    ["딥러닝 모델 학습 과정을 단계별로 알려줘."],
+    ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
+]
+# Create the Gradio ChatInterface
 demo = gr.ChatInterface(
+    fn=predict,                 # The function to call for generating responses
+    chatbot=chatbot_component,  # The chatbot display component
+    title="🇰🇷 네이버 HyperCLOVA X SEED (0.5B) 데모",
     description=(
+        f"**모델:** {MODEL_ID}\n"
+        f"**환경:** Hugging Face 무료 CPU (16GB RAM)\n"
+        f"**주의:** CPU에서 실행되므로 응답 생성에 다소 시간이 걸릴 수 있습니다 (특히 첫 응답). "
+        f"최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다."
     ),
+    examples=examples,
+    cache_examples=False,      # Disable caching on free tier to save disk/memory
+    theme="soft",              # Use a soft theme
+    retry_btn="다시 시도",      # NOTE(review): retry_btn/undo_btn/clear_btn look like Gradio 4.x-only kwargs - confirm the pinned Gradio version
+    undo_btn="이전 턴 삭제",
+    clear_btn="대화 초기화",
 )
 # --- Launch the App ---
 if __name__ == "__main__":
+    print("--- Launching Gradio App ---")
+    # queue() is important for handling multiple users, especially with slow inference on CPU
+    # Uncomment default_concurrency_limit=1 below if resource exhaustion occurs, otherwise default might be okay.
+    demo.queue(
+        # default_concurrency_limit=1 # Limit concurrent requests if needed
+        ).launch(
+            # share=False # Set to True to get a public link (requires login)
+            # server_name="0.0.0.0" # To make it accessible on the network if running locally
+            )