import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import psutil
import seaborn as sns
import streamlit as st
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Streamlit app configuration
st.set_page_config(page_title="DeepSeek Tuning App", layout="wide")
st.title("DeepSeek Model Tuning for RAM and Context Length")

# Sidebar for user inputs
st.sidebar.header("Configuration")
model_choice = st.sidebar.selectbox(
    "Select DeepSeek Model",
    ["deepseek-ai/deepseek-v2", "deepseek-ai/deepseek-coder-6.7b-instruct"],
    help="Select an available DeepSeek model."
)
context_length = st.sidebar.slider("Max Context Length", 1024, 16384, 4096, step=1024)
quantization = st.sidebar.checkbox("Enable 4-bit Quantization", value=True)
hf_token = st.sidebar.text_input("Hugging Face Token (optional)", type="password")
run_button = st.sidebar.button("Run Model")


# Function to get system-wide RAM usage as a percentage
def get_ram_usage():
    return psutil.virtual_memory().percent


# Function to load the model and tokenizer (cached so reruns reuse the same objects)
@st.cache_resource
def load_model(model_name, quantize=False, token=None):
    try:
        if token:
            st.write("Logging in to Hugging Face with provided token...")
            login(token)
        st.write(f"Loading {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)
        if quantize and torch.cuda.is_available():
            # 4-bit quantization via bitsandbytes only works on CUDA devices
            bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                quantization_config=bnb_config,
                device_map="auto",
                token=token
            )
        else:
            if quantize:
                st.warning("4-bit quantization requires a CUDA GPU; loading without quantization.")
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                device_map="auto",
                token=token
            )
        return model, tokenizer
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.write("Please verify the model name on https://huggingface.co/models or provide a valid token.")
        return None, None


# Function to run inference at the requested context length and record RAM usage and timing
def run_inference(model, tokenizer, context_len):
    ram_usages = []
    inference_times = []
    # Repeat a short prompt to roughly fill the requested context window,
    # then truncate to context_len tokens
    prompt = "Write a detailed essay about artificial intelligence advancements. " * (context_len // 50)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len)
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")
    start_time = time.time()
    ram_before = get_ram_usage()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)
    ram_after = get_ram_usage()
    inference_time = time.time() - start_time
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    ram_usages.extend([ram_before, ram_after])
    inference_times.append(inference_time)
    return result, ram_usages, inference_times


# Visualization function
def plot_results(ram_usages, inference_times, context_len):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    # RAM usage plot
    sns.barplot(x=["Before", "After"], y=ram_usages, ax=ax1)
    ax1.set_title(f"RAM Usage (%) - Context Length: {context_len}")
    ax1.set_ylabel("RAM Usage (%)")
    # Inference time plot
    sns.barplot(x=["Inference"], y=inference_times, ax=ax2)
    ax2.set_title("Inference Time (seconds)")
    ax2.set_ylabel("Time (s)")
    st.pyplot(fig)


# Main execution
if run_button:
    with st.spinner("Installing and tuning the model..."):
        # Install bitsandbytes if quantization is enabled; a flag file avoids reinstalling on every rerun
        if quantization and not os.path.exists("./bnb_installed"):
            st.write("Installing bitsandbytes for quantization...")
            os.system(f"{sys.executable} -m pip install bitsandbytes")
            with open("./bnb_installed", "w") as f:
                f.write("installed")

        # Load model
        model, tokenizer = load_model(model_choice, quantization, hf_token if hf_token else None)
        if model is None or tokenizer is None:
            st.stop()

        # Run inference at the selected context length
        st.write(f"Tuning {model_choice} with context length {context_length}...")
        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length)

        # Display results
        st.subheader("Generated Output")
        st.write(result)
        st.subheader("Performance Metrics")
        plot_results(ram_usages, inference_times, context_length)

        # Additional info
        st.write(f"Max Context Length Used: {context_length}")
        st.write(f"Quantization Enabled: {quantization}")
        st.write(f"Average RAM Usage: {np.mean(ram_usages):.2f}%")
        st.write(f"Inference Time: {inference_times[0]:.2f} seconds")

# Instructions for user
st.markdown("""
### Instructions
1. Select a DeepSeek model from the sidebar.
2. Adjust the context length (higher values use more RAM).
3. Enable quantization to reduce RAM usage (optional).
4. Provide a Hugging Face token if the model is private.
5. Click 'Run Model' to install, tune, and visualize results.

**Note:** Ensure the model name is correct and accessible.
""")
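
# Usage sketch (assumption: the script above is saved as app.py; adjust the filename to match yours):
#   pip install streamlit torch transformers accelerate psutil matplotlib seaborn huggingface_hub numpy
#   streamlit run app.py
# Note: device_map="auto" relies on the accelerate package, and bitsandbytes is installed at runtime
# only when 4-bit quantization is enabled.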