ultralight99 committed
Commit 0f921d3 · Parent: d2b9475

Added files

Files changed (2):
  1. app.py +53 -58
  2. requirements.txt +3 -2
app.py CHANGED
@@ -6,7 +6,7 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 import time
 import os
-from vllm import LLM, SamplingParams
+from huggingface_hub import login
 import numpy as np
 
 # Streamlit app configuration
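This hunk only touches lines 6-12 of app.py, so the top of the import block never appears in the diff. For orientation, here is a sketch of the header the new code appears to rely on, inferred from the calls in the hunks below and from requirements.txt; the exact order and any additional imports in the real file are assumptions:

# Presumed app.py header (inferred, not part of the commit)
import streamlit as st                                   # st.sidebar, st.write, st.markdown, ...
import torch                                             # torch.bfloat16, torch.cuda.is_available()
from transformers import AutoTokenizer, AutoModelForCausalLM
import psutil                                            # backs get_ram_usage()
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
from huggingface_hub import login                        # added by this commit, replacing the vLLM import
import numpy as np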
@@ -17,11 +17,12 @@ st.title("DeepSeek Model Tuning for RAM and Context Length")
 st.sidebar.header("Configuration")
 model_choice = st.sidebar.selectbox(
     "Select DeepSeek Model",
-    ["deepseek-ai/DeepSeek-V2-Lite-Instruct", "deepseek-ai/DeepSeek-V3"],
-    help="DeepSeek-V3 is 671B params, V2-Lite is more manageable at 15.7B."
+    ["deepseek-ai/deepseek-v2", "deepseek-ai/deepseek-coder-6.7b-instruct"],
+    help="Select an available DeepSeek model."
 )
-context_length = st.sidebar.slider("Max Context Length", 1024, 32768, 4096, step=1024)
+context_length = st.sidebar.slider("Max Context Length", 1024, 16384, 4096, step=1024)
 quantization = st.sidebar.checkbox("Enable 4-bit Quantization", value=True)
+hf_token = st.sidebar.text_input("Hugging Face Token (optional)", type="password")
 run_button = st.sidebar.button("Run Model")
 
 # Function to get RAM usage
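The body of get_ram_usage() sits outside every hunk, so the commit never shows it. Given that requirements.txt pins psutil, it is presumably a small psutil wrapper along these lines; the exact metric (process RSS versus system-wide usage) and the MB unit are guesses:

import os
import psutil

def get_ram_usage():
    # Hypothetical reconstruction: resident set size of the Streamlit process, in MB.
    # The real helper might instead report psutil.virtual_memory().percent.
    return psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)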
@@ -30,62 +31,57 @@ def get_ram_usage():
 
 # Function to install and load the model
 @st.cache_resource
-def load_model(model_name, quantize=False):
+def load_model(model_name, quantize=False, token=None):
     try:
+        if token:
+            st.write("Logging in to Hugging Face with provided token...")
+            login(token)
+
         st.write(f"Loading {model_name}...")
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)
 
-        if model_name == "deepseek-ai/DeepSeek-V3":
-            # For V3, we'll assume vLLM for efficiency (requires setup)
-            llm = LLM(model=model_name, max_model_len=context_length, tensor_parallel_size=1)
-            return llm, tokenizer
+        if quantize and torch.cuda.is_available():
+            from transformers import BitsAndBytesConfig
+            bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                quantization_config=bnb_config,
+                device_map="auto",
+                token=token
+            )
         else:
-            # For V2-Lite, use transformers with quantization if selected
-            if quantize:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    trust_remote_code=True,
-                    torch_dtype=torch.bfloat16,
-                    device_map="auto",
-                    load_in_4bit=True
-                )
-            else:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    trust_remote_code=True,
-                    torch_dtype=torch.bfloat16,
-                    device_map="auto"
-                )
-            return model, tokenizer
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+                device_map="auto",
+                token=token
+            )
+        return model, tokenizer
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
+        st.write("Please verify the model name on https://huggingface.co/models or provide a valid token.")
         return None, None
 
 # Function to tune and run inference
-def run_inference(model, tokenizer, context_len, model_name):
+def run_inference(model, tokenizer, context_len):
     ram_usages = []
     inference_times = []
-    prompt = "Write a detailed essay about artificial intelligence advancements." * (context_len // 50) # Scale prompt to context length
+    prompt = "Write a detailed essay about artificial intelligence advancements." * (context_len // 50)
+
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len)
+    if torch.cuda.is_available():
+        inputs = inputs.to("cuda")
 
-    if model_name == "deepseek-ai/DeepSeek-V3":
-        # vLLM inference
-        sampling_params = SamplingParams(max_tokens=100, temperature=0.7)
-        start_time = time.time()
-        ram_before = get_ram_usage()
-        outputs = model.generate([prompt], sampling_params)
-        ram_after = get_ram_usage()
-        inference_time = time.time() - start_time
-        result = outputs[0].outputs[0].text
-    else:
-        # Transformers inference
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len).to("cuda")
-        start_time = time.time()
-        ram_before = get_ram_usage()
+    start_time = time.time()
+    ram_before = get_ram_usage()
+    with torch.no_grad():
         outputs = model.generate(**inputs, max_new_tokens=100)
-        ram_after = get_ram_usage()
-        inference_time = time.time() - start_time
-        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    ram_after = get_ram_usage()
+    inference_time = time.time() - start_time
 
+    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
     ram_usages.extend([ram_before, ram_after])
     inference_times.append(inference_time)
     return result, ram_usages, inference_times
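The new quantized branch builds a BitsAndBytesConfig, which is exported by transformers and backed by the bitsandbytes package added to requirements.txt. A minimal standalone sketch of that loading path (not part of the commit) can help verify the environment outside Streamlit; the model id is only a placeholder, and 4-bit loading needs a CUDA GPU with bitsandbytes installed:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "deepseek-ai/deepseek-coder-6.7b-instruct"   # placeholder; any causal LM works

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                                  # store weights in 4-bit blocks
    bnb_4bit_compute_dtype=torch.bfloat16,              # run matmuls in bf16, as in the diff above
)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",                                  # place layers on the available GPU(s)
)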
@@ -109,25 +105,23 @@ def plot_results(ram_usages, inference_times, context_len):
 # Main execution
 if run_button:
     with st.spinner("Installing and tuning the model..."):
-        # Install dependencies if needed (for Hugging Face Space, assume pre-installed)
-        if not os.path.exists("./vllm_installed"):
-            st.write("Installing vLLM for DeepSeek-V3 support...")
-            os.system("pip install vllm")
-            with open("./vllm_installed", "w") as f:
+        # Install bitsandbytes if quantization is enabled
+        if quantization and not os.path.exists("./bnb_installed"):
+            st.write("Installing bitsandbytes for quantization...")
+            os.system("pip install bitsandbytes")
+            with open("./bnb_installed", "w") as f:
                 f.write("installed")
 
         # Load model
-        model, tokenizer = load_model(model_choice, quantization)
+        model, tokenizer = load_model(model_choice, quantization, hf_token if hf_token else None)
         if model is None or tokenizer is None:
             st.stop()
 
         # Tune for max RAM and context length
         st.write(f"Tuning {model_choice} with context length {context_length}...")
-        if model_choice == "deepseek-ai/DeepSeek-V3":
-            st.warning("DeepSeek-V3 requires significant GPU resources. Ensure proper setup.")
 
         # Run inference
-        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length, model_choice)
+        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length)
 
         # Display results
         st.subheader("Generated Output")
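plot_results() is named in the hunk header above but its body is untouched by this commit, so it never appears in the diff. A purely hypothetical sketch of what such a helper could look like with the matplotlib/seaborn/streamlit dependencies already in requirements.txt:

import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

def plot_results(ram_usages, inference_times, context_len):
    # Hypothetical: ram_usages is [before, after] in MB, inference_times has one entry.
    sns.set_theme()
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.bar(["before", "after"], ram_usages)
    ax1.set_ylabel("RAM (MB)")
    ax1.set_title(f"RAM around generation (context {context_len})")
    ax2.bar(["run 1"], inference_times)
    ax2.set_ylabel("seconds")
    ax2.set_title("Inference time")
    st.pyplot(fig)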
@@ -145,9 +139,10 @@ if run_button:
 # Instructions for user
 st.markdown("""
 ### Instructions
-1. Select the DeepSeek model from the sidebar.
+1. Select a DeepSeek model from the sidebar.
 2. Adjust the context length (higher values use more RAM).
 3. Enable quantization to reduce RAM usage (optional).
-4. Click 'Run Model' to install, tune, and visualize results.
-**Note:** DeepSeek-V3 (671B) requires high-end hardware. Use V2-Lite for moderate setups.
+4. Provide a Hugging Face token if the model is private.
+5. Click 'Run Model' to install, tune, and visualize results.
+**Note:** Ensure the model name is correct and accessible.
 """)
requirements.txt CHANGED
@@ -1,8 +1,9 @@
 torch
 transformers
-vllm
+bitsandbytes
 psutil
 matplotlib
 seaborn
 streamlit
-numpy
+numpy
+huggingface_hub