# -*- coding: utf-8 -*-
"""Hugging Face Space App with CPU Quantization"""
import os
import gradio as gr
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Log in to Hugging Face, authenticating with an access token
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "Hugging Face access token not found! Set the 'HF_TOKEN' environment variable or provide a valid token directly."
    )
login(HF_TOKEN)
# Configure 4-bit quantization
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
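# Optional, more explicit 4-bit setup (illustrative values, not part of the original
# config): NF4 quantization with float16 compute is a common choice for chat models.
# Requires `import torch` at the top if enabled.
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )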
# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
quantization_config=quantization_config,
device_map="auto",
token=HF_TOKEN
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
# Define the inference function
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids.to(model.device),  # keep inputs on the same device as the model
        max_length=200,
        num_beams=5,
        repetition_penalty=1.2,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Build the interface with Gradio
interface = gr.Interface(
fn=generate_text,
inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
outputs="text",
title="Llama 2 Text Generator (CPU Quantized)",
description="Generate text using the Llama-2-13b-chat-hf model with CPU quantization hosted on Hugging Face Spaces."
)
# Launch the app
if __name__ == "__main__":
interface.launch()