ultralight99 committed
Commit 0f921d3 · Parent: d2b9475

Added files

Files changed (2):
  1. app.py +53 -58
  2. requirements.txt +3 -2
app.py CHANGED
@@ -6,7 +6,7 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 import time
 import os
-from vllm import LLM, SamplingParams
+from huggingface_hub import login
 import numpy as np
 
 # Streamlit app configuration
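This hunk only touches lines 6-12 of app.py, so the top of the import block never appears in the diff. For orientation, here is a sketch of the header the new code appears to rely on, inferred from the calls in the hunks below and from requirements.txt; the exact order and any additional imports in the real file are assumptions:

# Presumed app.py header (inferred, not part of the commit)
import streamlit as st                                   # st.sidebar, st.write, st.markdown, ...
import torch                                             # torch.bfloat16, torch.cuda.is_available()
from transformers import AutoTokenizer, AutoModelForCausalLM
import psutil                                            # backs get_ram_usage()
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
from huggingface_hub import login                        # added by this commit, replacing the vLLM import
import numpy as np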
@@ -17,11 +17,12 @@ st.title("DeepSeek Model Tuning for RAM and Context Length")
 st.sidebar.header("Configuration")
 model_choice = st.sidebar.selectbox(
     "Select DeepSeek Model",
-    ["deepseek-ai/DeepSeek-V2-Lite-Instruct", "deepseek-ai/DeepSeek-V3"],
-    help="DeepSeek-V3 is 671B params, V2-Lite is more manageable at 15.7B."
+    ["deepseek-ai/deepseek-v2", "deepseek-ai/deepseek-coder-6.7b-instruct"],
+    help="Select an available DeepSeek model."
 )
-context_length = st.sidebar.slider("Max Context Length", 1024, 32768, 4096, step=1024)
+context_length = st.sidebar.slider("Max Context Length", 1024, 16384, 4096, step=1024)
 quantization = st.sidebar.checkbox("Enable 4-bit Quantization", value=True)
+hf_token = st.sidebar.text_input("Hugging Face Token (optional)", type="password")
 run_button = st.sidebar.button("Run Model")
 
 # Function to get RAM usage
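The body of get_ram_usage() sits outside every hunk, so the commit never shows it. Given that requirements.txt pins psutil, it is presumably a small psutil wrapper along these lines; the exact metric (process RSS versus system-wide usage) and the MB unit are guesses:

import os
import psutil

def get_ram_usage():
    # Hypothetical reconstruction: resident set size of the Streamlit process, in MB.
    # The real helper might instead report psutil.virtual_memory().percent.
    return psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)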
@@ -30,62 +31,57 @@ def get_ram_usage():
 
 # Function to install and load the model
 @st.cache_resource
-def load_model(model_name, quantize=False):
+def load_model(model_name, quantize=False, token=None):
     try:
+        if token:
+            st.write("Logging in to Hugging Face with provided token...")
+            login(token)
+
         st.write(f"Loading {model_name}...")
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)
 
-        if model_name == "deepseek-ai/DeepSeek-V3":
-            # For V3, we'll assume vLLM for efficiency (requires setup)
-            llm = LLM(model=model_name, max_model_len=context_length, tensor_parallel_size=1)
-            return llm, tokenizer
+        if quantize and torch.cuda.is_available():
+            from transformers import BitsAndBytesConfig
+            bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                quantization_config=bnb_config,
+                device_map="auto",
+                token=token
+            )
         else:
-            # For V2-Lite, use transformers with quantization if selected
-            if quantize:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    trust_remote_code=True,
-                    torch_dtype=torch.bfloat16,
-                    device_map="auto",
-                    load_in_4bit=True
-                )
-            else:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    trust_remote_code=True,
-                    torch_dtype=torch.bfloat16,
-                    device_map="auto"
-                )
-            return model, tokenizer
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+                device_map="auto",
+                token=token
+            )
+        return model, tokenizer
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
+        st.write("Please verify the model name on https://huggingface.co/models or provide a valid token.")
         return None, None
 
 # Function to tune and run inference
-def run_inference(model, tokenizer, context_len, model_name):
+def run_inference(model, tokenizer, context_len):
     ram_usages = []
     inference_times = []
-    prompt = "Write a detailed essay about artificial intelligence advancements." * (context_len // 50) # Scale prompt to context length
+    prompt = "Write a detailed essay about artificial intelligence advancements." * (context_len // 50)
+
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len)
+    if torch.cuda.is_available():
+        inputs = inputs.to("cuda")
 
-    if model_name == "deepseek-ai/DeepSeek-V3":
-        # vLLM inference
-        sampling_params = SamplingParams(max_tokens=100, temperature=0.7)
-        start_time = time.time()
-        ram_before = get_ram_usage()
-        outputs = model.generate([prompt], sampling_params)
-        ram_after = get_ram_usage()
-        inference_time = time.time() - start_time
-        result = outputs[0].outputs[0].text
-    else:
-        # Transformers inference
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len).to("cuda")
-        start_time = time.time()
-        ram_before = get_ram_usage()
+    start_time = time.time()
+    ram_before = get_ram_usage()
+    with torch.no_grad():
         outputs = model.generate(**inputs, max_new_tokens=100)
-        ram_after = get_ram_usage()
-        inference_time = time.time() - start_time
-        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    ram_after = get_ram_usage()
+    inference_time = time.time() - start_time
 
+    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
     ram_usages.extend([ram_before, ram_after])
     inference_times.append(inference_time)
     return result, ram_usages, inference_times
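The new quantized branch builds a BitsAndBytesConfig, which is exported by transformers and backed by the bitsandbytes package added to requirements.txt. A minimal standalone sketch of that loading path (not part of the commit) can help verify the environment outside Streamlit; the model id is only a placeholder, and 4-bit loading needs a CUDA GPU with bitsandbytes installed:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "deepseek-ai/deepseek-coder-6.7b-instruct"   # placeholder; any causal LM works

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                                  # store weights in 4-bit blocks
    bnb_4bit_compute_dtype=torch.bfloat16,              # run matmuls in bf16, as in the diff above
)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",                                  # place layers on the available GPU(s)
)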
@@ -109,25 +105,23 @@ def plot_results(ram_usages, inference_times, context_len):
 # Main execution
 if run_button:
     with st.spinner("Installing and tuning the model..."):
-        # Install dependencies if needed (for Hugging Face Space, assume pre-installed)
-        if not os.path.exists("./vllm_installed"):
-            st.write("Installing vLLM for DeepSeek-V3 support...")
-            os.system("pip install vllm")
-            with open("./vllm_installed", "w") as f:
+        # Install bitsandbytes if quantization is enabled
+        if quantization and not os.path.exists("./bnb_installed"):
+            st.write("Installing bitsandbytes for quantization...")
+            os.system("pip install bitsandbytes")
+            with open("./bnb_installed", "w") as f:
                 f.write("installed")
 
         # Load model
-        model, tokenizer = load_model(model_choice, quantization)
+        model, tokenizer = load_model(model_choice, quantization, hf_token if hf_token else None)
         if model is None or tokenizer is None:
             st.stop()
 
         # Tune for max RAM and context length
         st.write(f"Tuning {model_choice} with context length {context_length}...")
-        if model_choice == "deepseek-ai/DeepSeek-V3":
-            st.warning("DeepSeek-V3 requires significant GPU resources. Ensure proper setup.")
 
         # Run inference
-        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length, model_choice)
+        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length)
 
         # Display results
         st.subheader("Generated Output")
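plot_results() is named in the hunk header above but its body is untouched by this commit, so it never appears in the diff. A purely hypothetical sketch of what such a helper could look like with the matplotlib/seaborn/streamlit dependencies already in requirements.txt:

import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

def plot_results(ram_usages, inference_times, context_len):
    # Hypothetical: ram_usages is [before, after] in MB, inference_times has one entry.
    sns.set_theme()
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.bar(["before", "after"], ram_usages)
    ax1.set_ylabel("RAM (MB)")
    ax1.set_title(f"RAM around generation (context {context_len})")
    ax2.bar(["run 1"], inference_times)
    ax2.set_ylabel("seconds")
    ax2.set_title("Inference time")
    st.pyplot(fig)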
@@ -145,9 +139,10 @@ if run_button:
 # Instructions for user
 st.markdown("""
 ### Instructions
-1. Select the DeepSeek model from the sidebar.
+1. Select a DeepSeek model from the sidebar.
 2. Adjust the context length (higher values use more RAM).
 3. Enable quantization to reduce RAM usage (optional).
-4. Click 'Run Model' to install, tune, and visualize results.
-**Note:** DeepSeek-V3 (671B) requires high-end hardware. Use V2-Lite for moderate setups.
+4. Provide a Hugging Face token if the model is private.
+5. Click 'Run Model' to install, tune, and visualize results.
+**Note:** Ensure the model name is correct and accessible.
 """)
requirements.txt CHANGED
@@ -1,8 +1,9 @@
 torch
 transformers
-vllm
+bitsandbytes
 psutil
 matplotlib
 seaborn
 streamlit
-numpy
+numpy
+huggingface_hub