# Hugging Face Space app ("Running on Zero" / ZeroGPU)
import spaces  # If using Hugging Face Spaces (ZeroGPU)
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  # Import BitsAndBytesConfig
import torch
import gradio as gr
import os
# --- Environment and PyTorch configuration (kept from the original code) ---
# Use os.environ assignments (not os.putenv) so the values are visible to libraries
# reading the environment in this same process (e.g. huggingface_hub).
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"

# ** DOCUMENTATION: Quantization Configuration **
# To load the model in 4-bit (or 8-bit), you now use BitsAndBytesConfig rather than
# passing load_in_4bit/load_in_8bit directly to from_pretrained.
# This is useful if you're VRAM-constrained.

# Example for 4-bit quantization:
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,  # Optional: quantizes the quantization constants as well, saving roughly 0.4 bits per parameter
    bnb_4bit_quant_type="nf4",       # Recommended: "nf4" (NormalFloat4); "fp4" is the floating-point 4-bit alternative
    bnb_4bit_compute_dtype=torch.bfloat16  # Or torch.float16. Computation happens in this dtype;
                                           # bfloat16 is good if your GPU supports it (Ampere onwards).
)
# Example for 8-bit quantization (if you prefer that over 4-bit):
# print("Setting up 8-bit quantization config...")
# quantization_config_8bit = BitsAndBytesConfig(
#     load_in_8bit=True
# )
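
# A minimal sketch (not in the original code): if you are unsure whether the GPU supports
# bfloat16, you could pick the compute dtype at runtime and pass it as bnb_4bit_compute_dtype
# above. `compute_dtype` is an illustrative name introduced here, not part of the original.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)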
# ** DOCUMENTATION: Model Loading with Quantization **
print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,  # Pass the config here
    device_map="auto",  # CRITICAL: use device_map="auto" for quantized models.
                        # It automatically distributes the model across available GPU/CPU memory as needed.
                        # Do NOT call .to('cuda') afterwards when using device_map="auto" with quantization.
    # torch_dtype="auto",  # With device_map="auto" and quantization the dtype is usually handled for you;
                           # bnb_4bit_compute_dtype in BitsAndBytesConfig sets the compute precision.
    # trust_remote_code=True,  # As discussed, generally not needed for Vicuna
)
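
# Optional sanity check (illustrative, not in the original code): report the memory footprint
# of the quantized weights and how device_map="auto" placed the modules.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"Device map: {getattr(model, 'hf_device_map', 'not available')}")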
print(f"Loading tokenizer: {model_name}") | |
tokenizer = AutoTokenizer.from_pretrained( | |
model_name, | |
# trust_remote_code=True, | |
use_fast=True | |
) | |
if tokenizer.pad_token is None: | |
tokenizer.pad_token = tokenizer.eos_token | |
print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}") | |
# Note: model.config.pad_token_id is usually set by the tokenizer or handled by `generate`.
# With device_map="auto" the model may be spread across several devices, so there is no single
# "model device" in the traditional sense. Inputs should go to the device of the first model
# block; `generate` usually handles this, or you can look it up via model.hf_device_map.
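
# Illustrative helper (an assumption, not part of the original code): resolve the device that
# inputs should be moved to when device_map="auto" may have split the model. The "" key covers
# the whole model when it fits on one device; otherwise fall back to the device of the first
# mapped module (normally the embedding layer).
def get_input_device(m):
    device_map = getattr(m, "hf_device_map", None)
    if device_map:
        return device_map.get("", next(iter(device_map.values())))
    return m.device

# Assumption: since this Space imports `spaces` and runs on ZeroGPU, the GPU-bound function is
# decorated with @spaces.GPU so a GPU is attached for the duration of each call.
@spaces.GPU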
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return f"Error: Could not apply chat template. ({e})"
    # With device_map="auto" the model may span several devices. In practice, moving the inputs
    # to model.device (the device of the first parameters) is sufficient, and `generate` handles
    # placement from there; alternatively, look up the first entry of model.hf_device_map.
    model_inputs = tokenizer([text], return_tensors="pt")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids.to(model.device),           # move inputs to the device of the first model block
            attention_mask=model_inputs.attention_mask.to(model.device),
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()
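
# Quick local smoke test (illustrative; left commented out so it does not trigger GPU work at
# import time on Spaces):
# print(generate_code("Write a Python function that checks whether a string is a palindrome."))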
# --- Gradio Interface (kept mostly from the original code) ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:  # Updated title
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
            run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code(
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )
    gr.on(
        triggers=[
            run_button.click,
            prompt.submit
        ],
        fn=generate_code,
        inputs=[prompt],
        outputs=[result],
    )
if __name__ == "__main__":
    demo.launch(share=False, debug=True)
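
# Assumed dependencies (not listed in the original): a requirements.txt roughly like the
# following is needed for 4-bit loading and the UI; exact versions are an assumption.
#   torch
#   transformers
#   accelerate      # required for device_map="auto"
#   bitsandbytes    # required for BitsAndBytesConfig 4-/8-bit loading
#   gradio
#   hf_transfer     # only because HF_HUB_ENABLE_HF_TRANSFER=1 is set above
#   spaces          # usually pre-installed on ZeroGPU Spaces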