# Hugging Face Space (scraped status header: "Spaces: Sleeping")
import gradio as gr
from huggingface_hub import snapshot_download
from accelerate.utils import BnbQuantizationConfig
from accelerate.utils import load_and_quantize_model
from accelerate import Accelerator
from accelerate import init_empty_weights
#from mingpt.model import GPT
model_path="marcsun13/gpt2-xl-linear-sharded"
def quantize(model_path=model_path):
    """Download a sharded GPT2-XL checkpoint, quantize it to 8-bit, save it,
    and reload it to verify the round trip.

    Follows the accelerate bitsandbytes-quantization recipe: build an
    empty-weight minGPT skeleton, then materialize it from the downloaded
    shards with `load_and_quantize_model`.

    Parameters
    ----------
    model_path : str
        Hugging Face Hub repo id of the sharded checkpoint
        (default: "marcsun13/gpt2-xl-linear-sharded").
    """
    # Local import: minGPT provides the architecture matching this checkpoint.
    # NOTE(review): the top-of-file `from mingpt.model import GPT` was
    # commented out; the model class is required for the flow to work.
    from mingpt.model import GPT

    # snapshot_download returns the LOCAL DIRECTORY PATH (a str) holding
    # every shard of the checkpoint.
    weights_location = snapshot_download(repo_id=model_path)

    # 8-bit quantization; llm_int8_threshold is the outlier threshold.
    bnb_quantization_config = BnbQuantizationConfig(
        load_in_8bit=True, llm_int8_threshold=6
    )
    # 4-bit alternative (needs torch imported):
    # bnb_quantization_config = BnbQuantizationConfig(
    #     load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16,
    #     bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")

    # BUG FIX: the original read `weights_location.config` and called
    # `weights_location(model_config)` — but `weights_location` is a str,
    # so both raise at runtime. The intended object is a GPT built from
    # the (previously commented-out) default config.
    model_config = GPT.get_default_config()
    model_config.model_type = 'gpt2-xl'
    model_config.vocab_size = 50257
    model_config.block_size = 1024
    with init_empty_weights():
        # Architecture skeleton on the `meta` device: no real allocations.
        empty_model = GPT(model_config)

    # Materialize + quantize the empty model from the downloaded shards.
    quantized_model = load_and_quantize_model(
        empty_model,
        weights_location=weights_location,
        bnb_quantization_config=bnb_quantization_config,
        device_map="auto",
    )

    # Persist the quantized weights locally ...
    accelerate = Accelerator()
    new_weights_location = "./model"
    accelerate.save_model(quantized_model, new_weights_location)

    # ... and reload them to confirm the save/load round trip works.
    quantized_model_from_saved = load_and_quantize_model(
        empty_model,
        weights_location=new_weights_location,
        bnb_quantization_config=bnb_quantization_config,
        device_map="auto",
    )
    print("Done")
# Minimal Gradio front-end: a single unlabeled button that kicks off quantize().
with gr.Blocks() as app:
    run_button = gr.Button()
    run_button.click(fn=quantize, inputs=None, outputs=None)
app.launch()