# -*- coding: utf-8 -*-
"""Hugging Face Space App with INT8 Quantization"""
import os

import gradio as gr
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Log in to Hugging Face, authenticating with an access token.
HF_TOKEN = os.getenv("HF_TOKEN")  # Read the access token from the environment.
if not HF_TOKEN:
    raise ValueError(
        "Hugging Face access token not found! Set the 'HF_TOKEN' environment "
        "variable, or provide a valid access token directly."
    )
login(HF_TOKEN)  # Authenticate with the access token.
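
# Note: on Hugging Face Spaces, HF_TOKEN is typically added as a secret in the
# Space's Settings; secrets are exposed to the app as environment variables,
# which is why os.getenv() works here. The token's account must also have been
# granted access to the gated meta-llama/Llama-2-13b-chat-hf repository, or
# the model download below will fail at startup.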
# Load the quantized Llama-2-13b-chat-hf model.
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"

# Enable quantization. Recent transformers releases expect 8-bit loading to be
# configured through BitsAndBytesConfig rather than the deprecated
# load_in_8bit / use_auth_token keyword arguments.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",                        # Place layers on CPU/GPU automatically.
    quantization_config=quantization_config,  # Enable INT8 quantization.
    token=HF_TOKEN,                           # Use the Hugging Face access token.
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
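
# A quick sanity check, assuming the model loaded successfully. INT8 loading
# goes through bitsandbytes, which requires a CUDA GPU plus the `bitsandbytes`
# and `accelerate` packages (add both to the Space's requirements.txt).
# A 13B-parameter model is roughly 13 GB in INT8 versus ~26 GB in FP16:
# print(model.get_memory_footprint())  # Model size in bytes.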
# Define the inference function.
def generate_text(prompt):
    # Move the tokenized inputs onto the same device as the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,            # Pass input_ids and attention_mask together.
        max_new_tokens=200,  # Cap generated tokens independently of prompt length.
        num_beams=5,
        repetition_penalty=1.2,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
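
# A hedged usage sketch: the Llama-2 *chat* checkpoints were fine-tuned on the
# [INST] ... [/INST] prompt format, so wrapping raw user input tends to give
# better chat-style replies. chat_prompt is a hypothetical helper, not part of
# the original app; the BOS token is omitted because the tokenizer adds it.
def chat_prompt(user_message):
    return f"[INST] {user_message.strip()} [/INST]"

# Example: generate_text(chat_prompt("Explain INT8 quantization in one sentence."))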
# Build the interface with Gradio.
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Llama 2 Text Generator (INT8 Quantized)",
    description="Generate text using the INT8-quantized Llama-2-13b-chat-hf model hosted on Hugging Face Spaces.",
)
# Launch the app.
if __name__ == "__main__":
    interface.launch()