Tonic committed
Commit 8c33a08 (unverified)
1 Parent(s): 098ce94

memory efficient loading

Files changed (1)
  1. app.py +37 -26
app.py CHANGED
@@ -15,43 +15,57 @@ Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder
 
 model_id = "Tonic/c4ai-command-a-03-2025-4bit_fp4"
 
-# Define quantization config for 4-bit
+# Define quantization config with CPU offloading support
 quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,  # Enable 4-bit quantization
-    bnb_4bit_quant_type="fp4",  # Use FP4 quantization
-    bnb_4bit_use_double_quant=True,  # Optional: double quantization for better precision
-    # llm_int8_enable_fp32_cpu_offload=True  # Allow CPU offloading for 32-bit modules
-    bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation to save memory
+    load_in_4bit=True,
+    bnb_4bit_quant_type="fp4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading
 )
 
-# # Custom device map to offload non-critical components
-# custom_device_map = {
-#     "transformer": "cuda",  # Keep transformer layers on GPU
-#     "lm_head": "cpu",  # Offload language model head to CPU
-# }
+# Custom device map to split model across GPU and CPU
+custom_device_map = {
+    "transformer.word_embeddings": "cuda",
+    "transformer.h": "cuda",  # Main transformer layers on GPU
+    "transformer.ln_f": "cpu",  # Layer norm to CPU
+    "lm_head": "cpu"  # Language model head to CPU
+}
 
 # Load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    quantization_config=quantization_config,  # Apply quantization
-    device_map="auto",  # Automatically map to available devices
-    # device_map=custom_device_map,  # Use custom device map
+    quantization_config=quantization_config,
+    device_map=custom_device_map,  # Use custom device mapping
     torch_dtype=torch.bfloat16,
     token=HF_TOKEN,
-    max_position_embeddings=4096  # Reduce context window to 8k tokens (from 128k)
+    max_position_embeddings=8192  # Adjusted to 8k tokens for memory efficiency
 )
 
 @spaces.GPU
 def generate_response(user_input, max_new_tokens, temperature):
     messages = [{"role": "user", "content": user_input}]
-    input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
-    input_ids = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")  # Dynamic device placement
+    input_ids = tokenizer.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    )
+
+    # Move inputs to GPU if available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    input_ids = input_ids.to(device)
+
+    # Generate with memory-efficient settings
     gen_tokens = model.generate(
-        input_ids = input_ids,
+        input_ids=input_ids,
         max_new_tokens=max_new_tokens,
-        do_sample=True,
+        do_sample=True,
         temperature=temperature,
+        pad_token_id=tokenizer.eos_token_id,
+        # Add memory-efficient parameters
+        max_length=min(4000, max_new_tokens + input_ids.shape[-1]),  # Cap at context length
     )
 
     gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
@@ -60,12 +74,10 @@ def generate_response(user_input, max_new_tokens, temperature):
 
     return gen_text
 
-
-
 examples = [
-    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
-    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
-    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4}
+    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
+    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
+    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4}
 ]
 example_choices = [f"Example {i+1}" for i in range(len(examples))]
 
@@ -74,7 +86,6 @@ def load_example(choice):
     example = examples[index]
     return example["message"], example["max_new_tokens"], example["temperature"]
 
-
 with gr.Blocks() as demo:
     gr.Markdown(title)
     with gr.Row():
@@ -97,4 +108,4 @@ with gr.Blocks() as demo:
         outputs=[message_box, max_new_tokens_slider, temperature_slider]
     )
 
-demo.launch(ssr_mode=False)
+demo.launch(ssr_mode=False)
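
For reference, here is a minimal self-contained sketch of the loading pattern this commit moves to: 4-bit FP4 weights with double quantization, bf16 compute, fp32 CPU offload, and a GPU/CPU split. It is illustrative only and not part of the commit; the max_memory budget is a hypothetical placeholder, and device_map="auto" is used instead of a hand-written map because valid device-map keys must match the model's actual module names (the chosen placement can be inspected via model.hf_device_map after loading).

# Illustrative sketch only (not the committed app code).
# Assumes HF_TOKEN is set in the environment and enough CPU RAM for offloaded modules.
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Tonic/c4ai-command-a-03-2025-4bit_fp4"
hf_token = os.environ.get("HF_TOKEN")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="fp4",              # FP4 quantization ("nf4" is the common alternative)
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # run compute in bf16
    llm_int8_enable_fp32_cpu_offload=True,  # allow fp32 modules to live on the CPU
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",                        # let accelerate place modules; an explicit dict also works
    max_memory={0: "40GiB", "cpu": "64GiB"},  # hypothetical budget; tune to the actual hardware
    token=hf_token,
)

# Check where each module landed and the resulting weight footprint.
print(model.hf_device_map)
print(f"weights: {model.get_memory_footprint() / 1e9:.1f} GB")

Offloading layers to the CPU trades generation speed for fitting within VRAM; on the generation side, capping the output length (the new max_length / max_new_tokens settings) bounds KV-cache growth, which is where memory climbs during decoding.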