# -*- coding: utf-8 -*-
"""Hugging Face Space App with CPU Quantization"""

import os
import gradio as gr
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Log in to Hugging Face, authenticating with an access token
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "Hugging Face access token not found! Set the 'HF_TOKEN' environment "
        "variable or provide a valid access token directly."
    )

login(HF_TOKEN)

# Configure 4-bit quantization
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
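# Note: bitsandbytes 4-bit loading generally targets CUDA GPUs; whether it runs
# on a pure-CPU Space depends on the installed bitsandbytes build (worth
# verifying for this app). A fuller config is a common alternative; the sketch
# below uses illustrative values (nf4 quant type, float16 compute dtype), not
# requirements:
#
#     quantization_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.float16,  # requires `import torch`
#     )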

# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    token=HF_TOKEN
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)

# Define the inference function
def generate_text(prompt):
    # Tokenize and move tensors to the model's device
    # (important when loading with device_map="auto")
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,  # pass input_ids and attention_mask together
        max_length=200,
        num_beams=5,
        repetition_penalty=1.2,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
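
# Optional local smoke test (the prompt is just an example); uncomment to
# verify generation works before launching the Gradio UI:
# print(generate_text("Explain quantization in one sentence."))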

# Build the interface with Gradio
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Llama 2 Text Generator (CPU Quantized)",
    description="Generate text using the Llama-2-13b-chat-hf model with CPU quantization hosted on Hugging Face Spaces."
)
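
# On Hugging Face Spaces, launch() needs no arguments; for local runs, passing
# share=True creates a temporary public link (optional).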

# Launch the app
if __name__ == "__main__":
    interface.launch()