import spaces  # Hugging Face Spaces GPU support; keep this import at the top of the file
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gradio as gr
import os

# --- Environment and PyTorch configuration ---
os.environ["TORCH_LINALG_PREFER_CUSOLVER"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")

# --- Model and tokenizer configuration ---
model_name = "FelixChao/vicuna-33b-coder"

# Quantization: to load the model in 4-bit (or 8-bit), pass a BitsAndBytesConfig to
# from_pretrained(). This is the main lever when you are VRAM-constrained.
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,         # optional: slightly better accuracy for a small memory cost
    bnb_4bit_quant_type="nf4",              # "nf4" (NormalFloat4) is recommended; "fp4" is the alternative
    bnb_4bit_compute_dtype=torch.bfloat16,  # computation dtype; use torch.float16 on pre-Ampere GPUs
)

# 8-bit alternative, if preferred over 4-bit:
# quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

# Model loading with quantization. device_map="auto" is required so that accelerate
# places the quantized weights across the available GPU/CPU memory; do NOT call
# .to("cuda") on the model afterwards. Compute precision is governed by
# bnb_4bit_compute_dtype above, so torch_dtype does not need to be set here.
print(f"Loading model: {model_name} with 4-bit quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,
    device_map="auto",
    # trust_remote_code=True  # not needed for standard Llama/Vicuna architectures
)

print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer `pad_token` was None; set to `eos_token`: {tokenizer.eos_token}")

# Note: with device_map="auto" the model may be sharded across devices, so there is no
# single canonical device for it. Inputs should go to the device of the first shard;
# `model.device` resolves to it, and `model.hf_device_map` records the full placement.
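
# Optional diagnostics (a sketch, not part of the original app; assumes the model loaded
# on a CUDA machine). `hf_device_map` is populated by accelerate when device_map="auto"
# is used, and get_memory_footprint() reports the size of the quantized weights.
if torch.cuda.is_available():
    print(f"bf16 supported: {torch.cuda.is_bf16_supported()}")  # informs the bnb_4bit_compute_dtype choice
if hasattr(model, "hf_device_map"):
    print(f"Device map: {model.hf_device_map}")
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")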
@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return f"Error: Could not apply chat template. ({e})"

    # With device_map="auto" the model may span several devices. Inputs belong on the
    # device of the first shard; `model.device` resolves to it, with `hf_device_map`
    # as a fallback if that attribute is unavailable.
    input_device = model.device if hasattr(model, "device") else model.hf_device_map.get("", "cuda:0")
    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the prompt tokens and decode only the newly generated portion.
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()

# --- Gradio interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
        run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code(
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )
        gr.on(
            triggers=[run_button.click, prompt.submit],
            fn=generate_code,
            inputs=[prompt],
            outputs=[result],
        )

if __name__ == "__main__":
    demo.launch(share=False, debug=True)
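
# Optional CLI smoke test (commented out; a sketch that assumes enough free VRAM for the
# 4-bit model). Swap it in for demo.launch(...) above to exercise generate_code()
# directly without the Gradio UI:
#
# if __name__ == "__main__":
#     print(generate_code("Write a Python function that checks whether a string is a palindrome."))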