# -*- coding: utf-8 -*-
"""Hugging Face Space app serving Llama-2-13b-chat-hf with INT8 quantization."""

import os

import gradio as gr
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Authenticate with Hugging Face using an access token from the environment.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "Hugging Face access token not found! Set the 'HF_TOKEN' environment "
        "variable or provide a valid access token directly."
    )
login(HF_TOKEN)

# Load the quantized Llama-2-13b-chat-hf model.
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"

# Enable INT8 quantization. The bare load_in_8bit kwarg is deprecated in recent
# transformers releases; BitsAndBytesConfig is the supported way to request it.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",  # Automatically place layers on available devices (CPU/GPU)
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # INT8 via bitsandbytes
    token=HF_TOKEN,  # Gated model: requires an authorized token (use_auth_token is deprecated)
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)

# Inference function
def generate_text(prompt):
    # Move inputs to the model's device so generation works under device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,  # Passes input_ids and attention_mask together
        max_new_tokens=200,  # Cap generated tokens regardless of prompt length
        num_beams=5,
        repetition_penalty=1.2,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Build the Gradio interface
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Llama 2 Text Generator (INT8 Quantized)",
    description="Generate text using the INT8-quantized Llama-2-13b-chat-hf model hosted on Hugging Face Spaces.",
)

# Launch the app
if __name__ == "__main__":
    interface.launch()
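
# Deployment note: INT8 loading requires the bitsandbytes and accelerate
# packages alongside the usual stack. A minimal sketch of the Space's
# requirements.txt under that assumption (the file and any version pins are
# not part of the original source):
#
#     torch
#     transformers
#     accelerate
#     bitsandbytes
#     gradio
#     huggingface_hub
#
# A quick local smoke test, with a hypothetical prompt for illustration:
#
#     print(generate_text("Explain INT8 quantization in one sentence."))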