kimhyunwoo committed on
Commit 61bf4d9 · verified · 1 Parent(s): 6d73a79

Update app.py

Files changed (1)
  1. app.py +146 -74
app.py CHANGED
@@ -1,137 +1,209 @@
  import gradio as gr
  import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer
- import gc  # Garbage collector

  # --- Configuration ---
  MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
- MAX_NEW_TOKENS = 512  # Limit output length for faster response on CPU
- SYSTEM_PROMPT = "- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n- 사용자의 질문에 대해 친절하고 자세하게 답변해야 한다."  # Simplified system prompt

  # --- Model Loading ---
- # Load model and tokenizer explicitly on CPU
- # Using device_map='cpu' and torch_dtype=torch.float32 ensures it targets the CPU
- print(f"Loading model: {MODEL_ID}...")
  try:
      model = AutoModelForCausalLM.from_pretrained(
          MODEL_ID,
          torch_dtype=torch.float32,  # Use float32 for CPU compatibility
-         device_map="cpu"  # Explicitly load on CPU
      )
      tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-     print("Model and tokenizer loaded successfully on CPU.")
-     # Ensure model is in evaluation mode
-     model.eval()
  except Exception as e:
-     print(f"Error loading model: {e}")
-     # If loading fails, exit or handle gracefully
-     raise gr.Error(f"Failed to load the model {MODEL_ID}. Check logs. Error: {e}")

- # Define stop tokens based on the example, get their IDs
- stop_token_strings = ["<|endofturn|>", "<|stop|>"]
- stop_token_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]
- # Also include the standard EOS token if it's different
- if tokenizer.eos_token_id not in stop_token_ids:
-     stop_token_ids.append(tokenizer.eos_token_id)

  # --- Inference Function ---
  def predict(message, history):
      """
      Generates a response using the HyperCLOVAX model based on user message and chat history.
      """
-     # Reconstruct chat history in the required format
      chat_history_formatted = [
-         {"role": "tool_list", "content": ""},  # As per model card example structure
-         {"role": "system", "content": SYSTEM_PROMPT}
      ]
      for user_msg, ai_msg in history:
          chat_history_formatted.append({"role": "user", "content": user_msg})
-         # Add the endofturn token manually if needed, or rely on template
-         chat_history_formatted.append({"role": "assistant", "content": ai_msg})  # Ensure assistant response is captured correctly

-     # Add the current user message
      chat_history_formatted.append({"role": "user", "content": message})

-     # Apply the chat template
      try:
          inputs = tokenizer.apply_chat_template(
              chat_history_formatted,
-             add_generation_prompt=True,  # Crucial for instruct models
              return_dict=True,
              return_tensors="pt"
-         ).to(model.device)  # Ensure inputs are on the CPU device
-     except Exception as e:
-         print(f"Error applying chat template: {e}")
-         return f"Error formatting input: {e}"

-     print(f"Input tokens: {inputs['input_ids'].shape[1]}")  # Log input length

-     # Generate response - use torch.no_grad() to save memory
      try:
          with torch.no_grad():
              output_ids = model.generate(
                  **inputs,
                  max_new_tokens=MAX_NEW_TOKENS,
-                 eos_token_id=stop_token_ids,  # Use the identified stop token IDs
-                 pad_token_id=tokenizer.eos_token_id,  # Often set pad = eos
-                 do_sample=True,  # Sample for more diverse outputs
-                 temperature=0.7,
-                 top_p=0.9,
-                 # Note: The original example used 'stop_strings' and passed the tokenizer,
-                 # which isn't standard in `generate`. Using eos_token_id is preferred.
-                 # If specific stopping behavior is needed beyond EOS, StoppingCriteria can be used.
              )
      except Exception as e:
-         print(f"Error during model generation: {e}")
-         # Clean up memory if an error occurs (though we are on CPU, good practice)
          gc.collect()
-         # Consider torch.cuda.empty_cache() if GPU was accidentally involved
-         return f"Error during generation: {e}"

-     # Decode only the newly generated tokens
-     input_length = inputs['input_ids'].shape[1]
      new_tokens = output_ids[0, input_length:]
      response = tokenizer.decode(new_tokens, skip_special_tokens=True)

-     print(f"Output tokens: {len(new_tokens)}")  # Log output length
-     print(f"Raw response: {response}")

-     # Clean up memory (especially important in constrained environments)
      del inputs
      del output_ids
-     gc.collect()
-     # Consider torch.cuda.empty_cache() if GPU was involved

      return response

  # --- Gradio Interface ---
- # Use ChatInterface for a conversational experience
- chatbot = gr.Chatbot(label="HyperCLOVA X SEED (0.5B)", height=600)

  demo = gr.ChatInterface(
-     fn=predict,
-     chatbot=chatbot,
-     title="🇰🇷 HyperCLOVA X SEED (0.5B) - CPU Demo",
      description=(
-         f"Chat with the `naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B` model. "
-         f"This model excels in Korean language understanding. \n"
-         f"**Note:** Running on a free CPU Hugging Face Space. Responses will be **slow**. "
-         f"Max new tokens set to {MAX_NEW_TOKENS}."
      ),
-     examples=[
-         ["슈뢰딩거 방정식과 양자역학의 관계를 최대한 자세히 알려줘."],
-         ["네이버 하이퍼클로바X에 대해 설명해줘."],
-         ["오늘 날씨 어때?"],  # Example showing it might not know real-time data
-         ["간단한 파이썬 코드 예제를 보여줘."],
-     ],
-     cache_examples=False,  # Caching might consume too much memory/disk on free tier
-     theme="soft",
-     retry_btn=None,
-     undo_btn="Delete Previous Turn",
-     clear_btn="Clear Conversation",
  )

  # --- Launch the App ---
  if __name__ == "__main__":
-     # queue() is important for handling multiple users, especially with slow inference
-     demo.queue().launch()

  import gradio as gr
  import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer
+ import gc
+ import os
+ import datetime

  # --- Configuration ---
  MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
+ MAX_NEW_TOKENS = 512  # Limit output length for faster response on CPU (adjust as needed)
+ CPU_THREAD_COUNT = 4  # Limit threads torch uses on CPU if needed (adjust based on Space CPU core count)
+
+ # Set PyTorch CPU thread count (optional, might help prevent resource exhaustion)
+ # torch.set_num_threads(CPU_THREAD_COUNT)
+ # os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
+ # os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
+
+ print("--- Environment Setup ---")
+ print(f"PyTorch version: {torch.__version__}")
+ print(f"Running on device: cpu")  # Explicitly state we expect CPU
+ print(f"Torch Threads: {torch.get_num_threads()}")  # Check default threads

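# Note on the commented-out thread settings above: OMP_NUM_THREADS / MKL_NUM_THREADS
# generally need to be set before `import torch` for the OpenMP/MKL runtimes to pick
# them up reliably, whereas torch.set_num_threads() can be called at runtime. A minimal
# sketch (an illustration, not part of this file) would be:
#
#     import os
#     os.environ.setdefault("OMP_NUM_THREADS", "4")  # before importing torch
#     import torch
#     torch.set_num_threads(4)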
  # --- Model Loading ---
24
+ print(f"--- Loading Model: {MODEL_ID} ---")
25
+ print("This might take a few minutes, especially on the first launch...")
26
+
27
  try:
28
+ # Load model explicitly onto CPU with float32 (standard for CPU compatibility)
29
  model = AutoModelForCausalLM.from_pretrained(
30
  MODEL_ID,
31
  torch_dtype=torch.float32, # Use float32 for CPU compatibility
32
+ device_map="cpu" # Explicitly map to CPU
33
+ # low_cpu_mem_usage=True # Can sometimes help on low RAM, but might slow down loading
34
  )
35
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
36
+ model.eval() # Set model to evaluation mode
37
+ print("--- Model and Tokenizer Loaded Successfully on CPU ---")
38
+
39
+ # --- Stop Token Configuration ---
40
+ # Get IDs for specified stop tokens and the standard EOS token
41
+ stop_token_strings = ["<|endofturn|>", "<|stop|>"]
42
+ stop_token_ids_list = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]
43
+
44
+ # Ensure the official EOS token is also included if not already present
45
+ if tokenizer.eos_token_id not in stop_token_ids_list:
46
+ stop_token_ids_list.append(tokenizer.eos_token_id)
47
+
48
+ # Remove None values if any token wasn't found (though they should be in this vocab)
49
+ stop_token_ids_list = [tid for tid in stop_token_ids_list if tid is not None]
50
+
51
+ if not stop_token_ids_list:
52
+ print("Warning: Could not find any stop token IDs. Using default EOS only.")
53
+ stop_token_ids_list = [tokenizer.eos_token_id]
54
+
55
+ print(f"Using Stop Token IDs: {stop_token_ids_list}")
56
+
  except Exception as e:
+     print(f"!!! Error loading model: {e}")
+     # Clean up memory if partial loading occurred; guard the dels because the names
+     # are unbound when from_pretrained() itself fails
+     if "model" in globals():
+         del model
+     if "tokenizer" in globals():
+         del tokenizer
+     gc.collect()
+     # Raise a Gradio error to make it visible in the UI
+     raise gr.Error(f"Failed to load the model {MODEL_ID}. Please check the Space logs. Error: {e}")
+
+
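# Aside: if string-based stopping is ever needed beyond the eos_token_id list built above,
# a custom StoppingCriteria is one option. This is only an illustrative sketch, not used by
# the app; recent transformers releases also accept stop_strings= together with tokenizer=
# in generate().
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnStrings(StoppingCriteria):
    """Stops generation as soon as any of the given strings appears in the decoded tail."""

    def __init__(self, stop_strings, tokenizer, tail_tokens=16):
        self.stop_strings = stop_strings
        self.tokenizer = tokenizer
        self.tail_tokens = tail_tokens

    def __call__(self, input_ids, scores, **kwargs):
        # Decode only the last few tokens and check them for any stop string.
        tail = self.tokenizer.decode(input_ids[0, -self.tail_tokens:], skip_special_tokens=False)
        return any(s in tail for s in self.stop_strings)

# Usage sketch:
#     model.generate(..., stopping_criteria=StoppingCriteriaList([StopOnStrings(stop_token_strings, tokenizer)]))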
+ # --- System Prompt ---
+ # Use a dynamic date and the correct model name as per the card example structure
+ def get_system_prompt():
+     # current_date = datetime.datetime.now().strftime("%Y년 %m월 %d일(%a)")  # Korean date format
+     current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")  # English date format is safer for consistency
+     return (
+         f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
+         # f"- 오늘은 {current_date}이다.\n"  # Dynamic date can be added if desired
+         f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
+     )


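# Locale note for get_system_prompt(): strftime("%a"/"%A") follows the process locale
# (typically C/English in a Space container), so the commented-out Korean date format
# above would still render an English weekday name unless a Korean locale is available
# and activated first, e.g. locale.setlocale(locale.LC_TIME, "ko_KR.UTF-8").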
  # --- Inference Function ---
  def predict(message, history):
      """
      Generates a response using the HyperCLOVAX model based on user message and chat history.
+     Handles chat formatting, generation, decoding, and memory management.
      """
+     system_prompt = get_system_prompt()
+
+     # 1. Format conversation history according to the model's expected template
      chat_history_formatted = [
+         {"role": "tool_list", "content": ""},  # Required by the model card example
+         {"role": "system", "content": system_prompt}
      ]
      for user_msg, ai_msg in history:
          chat_history_formatted.append({"role": "user", "content": user_msg})
+         # Ensure the assistant response is included correctly; the end-of-turn token is
+         # normally added by apply_chat_template itself.
+         chat_history_formatted.append({"role": "assistant", "content": ai_msg})  # Append the actual AI response

+     # Add the latest user message
      chat_history_formatted.append({"role": "user", "content": message})

+     # 2. Apply the chat template
      try:
          inputs = tokenizer.apply_chat_template(
              chat_history_formatted,
+             add_generation_prompt=True,  # Crucial for instruction-following models
              return_dict=True,
              return_tensors="pt"
+         ).to(model.device)  # Ensure inputs are on the correct device (CPU)
+         input_length = inputs['input_ids'].shape[1]
+         print(f"\nInput tokens: {input_length}")
+         # print(f"Formatted input text (approx): {tokenizer.decode(inputs['input_ids'][0])}")  # For debugging

+     except Exception as e:
+         print(f"!!! Error applying chat template: {e}")
+         # Provide feedback to the user
+         return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"

+     # 3. Generate response using the model
+     output_ids = None  # Initialize output_ids
      try:
+         print("Generating response...")
+         # Use torch.no_grad() to reduce memory footprint during inference
          with torch.no_grad():
              output_ids = model.generate(
                  **inputs,
                  max_new_tokens=MAX_NEW_TOKENS,
+                 eos_token_id=stop_token_ids_list,  # Use the list of stop token IDs
+                 pad_token_id=tokenizer.eos_token_id,  # Set pad token ID to EOS token ID
+                 do_sample=True,  # Enable sampling for less repetitive output
+                 temperature=0.7,  # Control randomness (lower = more focused)
+                 top_p=0.9,  # Use nucleus sampling
+                 # num_beams=1,  # Use 1 for sampling (greedy is the default if do_sample=False)
+                 # early_stopping=True  # Stop generation early if EOS is reached
              )
+         print("Generation complete.")
+
      except Exception as e:
+         print(f"!!! Error during model generation: {e}")
+         # Clean up potentially large tensors in case of error
+         del inputs
+         if output_ids is not None: del output_ids
          gc.collect()
+         return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"

+     # 4. Decode the response
+     # We need to decode only the newly generated tokens, excluding the input tokens
      new_tokens = output_ids[0, input_length:]
      response = tokenizer.decode(new_tokens, skip_special_tokens=True)

+     print(f"Output tokens: {len(new_tokens)}")
+     # print(f"Raw response: '{response}'")  # Log the raw decoded output

+     # 5. Clean up memory
      del inputs
      del output_ids
+     del new_tokens
+     gc.collect()  # Explicitly run garbage collection
+     print("Memory cleaned.")

      return response

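# On CPU the full response can take a while, so perceived latency can be improved by
# streaming tokens as they are generated. A minimal sketch (an assumption, not wired into
# the interface below; it ignores prior history for brevity and relies on gr.ChatInterface
# accepting generator functions):
from threading import Thread
from transformers import TextIteratorStreamer

def predict_streaming(message, history):
    chat = [
        {"role": "tool_list", "content": ""},
        {"role": "system", "content": get_system_prompt()},
        {"role": "user", "content": message},
    ]
    inputs = tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_dict=True, return_tensors="pt"
    )
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        streamer=streamer,
        eos_token_id=stop_token_ids_list,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # Run generation in a background thread and yield partial text as it arrives.
    Thread(target=model.generate, kwargs=gen_kwargs).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # ChatInterface renders each partial response incrementally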
  # --- Gradio Interface ---
+ print("--- Setting up Gradio Interface ---")
+
+ # Use ChatInterface for a user-friendly chat experience
+ chatbot_component = gr.Chatbot(
+     label="HyperCLOVA X SEED (0.5B) 대화",
+     bubble_full_width=False,
+     height=600
+ )
+
+ # Define examples relevant to the model's strengths (Korean)
+ examples = [
+     ["네이버 클로바X는 무엇인가요?"],
+     ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
+     ["딥러닝 모델 학습 과정을 단계별로 알려줘."],
+     ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
+ ]

+ # Create the Gradio ChatInterface
  demo = gr.ChatInterface(
+     fn=predict,  # The function to call for generating responses
+     chatbot=chatbot_component,  # The chatbot display component
+     title="🇰🇷 네이버 HyperCLOVA X SEED (0.5B) 데모",
      description=(
+         f"**모델:** {MODEL_ID}\n"
+         f"**환경:** Hugging Face 무료 CPU (16GB RAM)\n"
+         f"**주의:** CPU에서 실행되므로 응답 생성에 다소 시간이 걸릴 수 있습니다 (특히 첫 응답). "
+         f"최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다."
      ),
+     examples=examples,
+     cache_examples=False,  # Disable caching on free tier to save disk/memory
+     theme="soft",  # Use a soft theme
+     retry_btn="다시 시도",
+     undo_btn="이전 턴 삭제",
+     clear_btn="대화 초기화",
  )

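# Note on cache_examples=False: with caching enabled, Gradio runs the chat function on every
# example at startup to pre-compute outputs, which on a free CPU Space would add a long cold
# start on top of the disk/memory cost mentioned in the comment above.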
  # --- Launch the App ---
  if __name__ == "__main__":
+     print("--- Launching Gradio App ---")
+     # queue() is important for handling multiple users, especially with slow inference on CPU
+     # Use default_concurrency_limit=1 if resource exhaustion occurs; otherwise the default is usually fine.
+     demo.queue(
+         # default_concurrency_limit=1  # Limit concurrent requests if needed
+     ).launch(
+         # share=False  # Set to True to get a public link (requires login)
+         # server_name="0.0.0.0"  # To make it accessible on the network if running locally
+     )
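# Dependency note (a sketch of assumed Space requirements, not pinned here): besides gradio,
# torch and transformers, passing device_map= to from_pretrained() requires the accelerate
# package to be installed.
#
#     gradio
#     torch
#     transformers
#     accelerate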