import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import psutil
import seaborn as sns
import streamlit as st
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Streamlit app configuration
st.set_page_config(page_title="DeepSeek Tuning App", layout="wide")
st.title("DeepSeek Model Tuning for RAM and Context Length")

# Sidebar for user inputs
st.sidebar.header("Configuration")
model_choice = st.sidebar.selectbox(
    "Select DeepSeek Model",
    ["deepseek-ai/deepseek-v2", "deepseek-ai/deepseek-coder-6.7b-instruct"],
    help="Select an available DeepSeek model."
)
context_length = st.sidebar.slider("Max Context Length", 1024, 16384, 4096, step=1024)
quantization = st.sidebar.checkbox("Enable 4-bit Quantization", value=True)
hf_token = st.sidebar.text_input("Hugging Face Token (optional)", type="password")
run_button = st.sidebar.button("Run Model")


# Function to get system-wide RAM usage as a percentage
def get_ram_usage():
    return psutil.virtual_memory().percent


# Function to load the model and tokenizer (cached so reruns reuse the same objects)
@st.cache_resource
def load_model(model_name, quantize=False, token=None):
    try:
        if token:
            st.write("Logging in to Hugging Face with provided token...")
            login(token)
        st.write(f"Loading {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)
        if quantize and torch.cuda.is_available():
            # 4-bit quantization via bitsandbytes only works on CUDA devices
            bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                quantization_config=bnb_config,
                device_map="auto",
                token=token
            )
        else:
            if quantize:
                st.warning("4-bit quantization requires a CUDA GPU; loading without quantization.")
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                device_map="auto",
                token=token
            )
        return model, tokenizer
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.write("Please verify the model name on https://huggingface.co/models or provide a valid token.")
        return None, None


# Function to run inference at the requested context length and record RAM usage and timing
def run_inference(model, tokenizer, context_len):
    ram_usages = []
    inference_times = []
    # Repeat a short prompt to roughly fill the requested context window,
    # then truncate to context_len tokens
    prompt = "Write a detailed essay about artificial intelligence advancements. " * (context_len // 50)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len)
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")
    start_time = time.time()
    ram_before = get_ram_usage()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)
    ram_after = get_ram_usage()
    inference_time = time.time() - start_time
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    ram_usages.extend([ram_before, ram_after])
    inference_times.append(inference_time)
    return result, ram_usages, inference_times


# Visualization function
def plot_results(ram_usages, inference_times, context_len):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    # RAM usage plot
    sns.barplot(x=["Before", "After"], y=ram_usages, ax=ax1)
    ax1.set_title(f"RAM Usage (%) - Context Length: {context_len}")
    ax1.set_ylabel("RAM Usage (%)")
    # Inference time plot
    sns.barplot(x=["Inference"], y=inference_times, ax=ax2)
    ax2.set_title("Inference Time (seconds)")
    ax2.set_ylabel("Time (s)")
    st.pyplot(fig)


# Main execution
if run_button:
    with st.spinner("Installing and tuning the model..."):
        # Install bitsandbytes if quantization is enabled; a flag file avoids reinstalling on every rerun
        if quantization and not os.path.exists("./bnb_installed"):
            st.write("Installing bitsandbytes for quantization...")
            os.system(f"{sys.executable} -m pip install bitsandbytes")
            with open("./bnb_installed", "w") as f:
                f.write("installed")

        # Load model
        model, tokenizer = load_model(model_choice, quantization, hf_token if hf_token else None)
        if model is None or tokenizer is None:
            st.stop()

        # Run inference at the selected context length
        st.write(f"Tuning {model_choice} with context length {context_length}...")
        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length)

        # Display results
        st.subheader("Generated Output")
        st.write(result)
        st.subheader("Performance Metrics")
        plot_results(ram_usages, inference_times, context_length)

        # Additional info
        st.write(f"Max Context Length Used: {context_length}")
        st.write(f"Quantization Enabled: {quantization}")
        st.write(f"Average RAM Usage: {np.mean(ram_usages):.2f}%")
        st.write(f"Inference Time: {inference_times[0]:.2f} seconds")

# Instructions for user
st.markdown("""
### Instructions
1. Select a DeepSeek model from the sidebar.
2. Adjust the context length (higher values use more RAM).
3. Enable quantization to reduce RAM usage (optional).
4. Provide a Hugging Face token if the model is private.
5. Click 'Run Model' to install, tune, and visualize results.

**Note:** Ensure the model name is correct and accessible.
""")
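
# Usage sketch (assumption: the script above is saved as app.py; adjust the filename to match yours):
#   pip install streamlit torch transformers accelerate psutil matplotlib seaborn huggingface_hub numpy
#   streamlit run app.py
# Note: device_map="auto" relies on the accelerate package, and bitsandbytes is installed at runtime
# only when 4-bit quantization is enabled.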