import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import psutil
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
from huggingface_hub import login
import numpy as np
# Streamlit app configuration
st.set_page_config(page_title="DeepSeek Tuning App", layout="wide")
st.title("DeepSeek Model Tuning for RAM and Context Length")

# Sidebar for user inputs
st.sidebar.header("Configuration")
model_choice = st.sidebar.selectbox(
    "Select DeepSeek Model",
    ["deepseek-ai/deepseek-v2", "deepseek-ai/deepseek-coder-6.7b-instruct"],
    help="Select an available DeepSeek model."
)
context_length = st.sidebar.slider("Max Context Length", 1024, 16384, 4096, step=1024)
quantization = st.sidebar.checkbox("Enable 4-bit Quantization", value=True)
hf_token = st.sidebar.text_input("Hugging Face Token (optional)", type="password")
run_button = st.sidebar.button("Run Model")
# Function to get RAM usage
def get_ram_usage():
    return psutil.virtual_memory().percent
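# Note: psutil.virtual_memory().percent reports system-wide RAM usage, not just this
# process. Optional helper (a sketch, not used by the app below) for a per-process figure:
def get_process_ram_mb():
    # Resident set size of the current process, in megabytes
    return psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)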
# Function to install and load the model
def load_model(model_name, quantize=False, token=None):
    try:
        if token:
            st.write("Logging in to Hugging Face with provided token...")
            login(token)
        st.write(f"Loading {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)
        if quantize and torch.cuda.is_available():
            # BitsAndBytesConfig is provided by transformers; the bitsandbytes package
            # only supplies the CUDA kernels it relies on.
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                quantization_config=bnb_config,
                device_map="auto",
                token=token
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                device_map="auto",
                token=token
            )
        return model, tokenizer
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.write("Please verify the model name on https://huggingface.co/models or provide a valid token.")
        return None, None
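# Example usage outside Streamlit (assumption, for quick local testing only):
# model, tokenizer = load_model("deepseek-ai/deepseek-coder-6.7b-instruct", quantize=True)
# The 4-bit path needs a CUDA GPU plus the bitsandbytes and accelerate packages
# (accelerate is also required for device_map="auto").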
# Function to tune and run inference
def run_inference(model, tokenizer, context_len):
    ram_usages = []
    inference_times = []
    # Repeat a short prompt to roughly fill the requested context window,
    # then truncate to exactly context_len tokens.
    prompt = "Write a detailed essay about artificial intelligence advancements. " * (context_len // 50)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len)
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")
    start_time = time.time()
    ram_before = get_ram_usage()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)
    ram_after = get_ram_usage()
    inference_time = time.time() - start_time
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    ram_usages.extend([ram_before, ram_after])
    inference_times.append(inference_time)
    return result, ram_usages, inference_times
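# Optional extension (sketch, not wired into the app): on GPU runs, peak VRAM for the
# generate() call could be tracked by calling torch.cuda.reset_peak_memory_stats()
# before generation and torch.cuda.max_memory_allocated() afterwards, to complement
# the system RAM percentages reported above.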
# Visualization function
def plot_results(ram_usages, inference_times, context_len):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    # RAM usage plot
    sns.barplot(x=["Before", "After"], y=ram_usages, ax=ax1)
    ax1.set_title(f"RAM Usage (%) - Context Length: {context_len}")
    ax1.set_ylabel("RAM Usage (%)")
    # Inference time plot
    sns.barplot(x=["Inference"], y=inference_times, ax=ax2)
    ax2.set_title("Inference Time (seconds)")
    ax2.set_ylabel("Time (s)")
    st.pyplot(fig)
# Main execution
if run_button:
    with st.spinner("Installing and tuning the model..."):
        # Install bitsandbytes if quantization is enabled
        if quantization and not os.path.exists("./bnb_installed"):
            st.write("Installing bitsandbytes for quantization...")
            os.system("pip install bitsandbytes")
            with open("./bnb_installed", "w") as f:
                f.write("installed")

        # Load model
        model, tokenizer = load_model(model_choice, quantization, hf_token if hf_token else None)
        if model is None or tokenizer is None:
            st.stop()

        # Tune for max RAM and context length
        st.write(f"Tuning {model_choice} with context length {context_length}...")

        # Run inference
        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length)

    # Display results
    st.subheader("Generated Output")
    st.write(result)
    st.subheader("Performance Metrics")
    plot_results(ram_usages, inference_times, context_length)

    # Additional info
    st.write(f"Max Context Length Used: {context_length}")
    st.write(f"Quantization Enabled: {quantization}")
    st.write(f"Average RAM Usage: {np.mean(ram_usages):.2f}%")
    st.write(f"Inference Time: {inference_times[0]:.2f} seconds")
# Instructions for the user (always shown)
st.markdown("""
### Instructions
1. Select a DeepSeek model from the sidebar.
2. Adjust the context length (higher values use more RAM).
3. Enable quantization to reduce RAM usage (optional).
4. Provide a Hugging Face token if the model is private.
5. Click 'Run Model' to install, tune, and visualize results.

**Note:** Ensure the model name is correct and accessible.
""")