File size: 3,215 Bytes
58f09d1
bbf9c8b
627db6a
a4fc148
 
58f09d1
627db6a
 
 
 
a4fc148
58f09d1
627db6a
 
 
a4fc148
 
 
 
627db6a
a4fc148
 
 
627db6a
a4fc148
627db6a
 
a4fc148
627db6a
a4fc148
 
bbf9c8b
627db6a
 
a4fc148
627db6a
a4fc148
bbf9c8b
627db6a
bbf9c8b
627db6a
a4fc148
 
bbf9c8b
 
 
 
 
 
 
 
 
a4fc148
bbf9c8b
 
 
 
a4fc148
627db6a
a4fc148
627db6a
 
 
a4fc148
 
627db6a
a4fc148
bbf9c8b
a4fc148
 
bbf9c8b
 
627db6a
 
bbf9c8b
627db6a
bbf9c8b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import threading
from dataclasses import fields

import gradio as gr
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Run everything on the CPU rather than on a GPU.
device = torch.device("cpu")

# Load the IMDb dataset.
dataset = load_dataset("imdb")

# Pick the column that holds the review text. Prefer the column literally
# named "text"; only fall back to the first column when no such column exists,
# so a change in column order cannot silently select a non-text column.
_train_columns = dataset["train"].column_names
text_column = "text" if "text" in _train_columns else _train_columns[0]

# Load the model and the tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the sequence-classification model and place it on the CPU device in one
# step (`.to()` hands back the module itself, so the calls can be chained).
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Preprocess the dataset into the form the model expects
def tokenize_function(examples):
    """Tokenize the text column of a batch of examples.

    Args:
        examples: Mapping from column name to values; only
            ``examples[text_column]`` is read.

    Returns:
        The tokenizer output for those texts, produced with
        ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples[text_column]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Tokenize both splits and drop the raw text column afterwards.
# `batch_size` is left at the `Dataset.map` default so each split is tokenized
# in chunks; passing `batch_size=None` hands the whole split to the tokenizer
# as one batch, which needlessly inflates peak memory. Every example is padded
# to the same fixed length, so the resulting columns are unchanged.
tokenized_train_datasets = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=[text_column],
)
tokenized_test_datasets = dataset["test"].map(
    tokenize_function,
    batched=True,
    remove_columns=[text_column],
)

# Training settings (GPU not used)
# Two keyword names differ between transformers releases: the evaluation
# schedule is `eval_strategy` in newer versions and `evaluation_strategy` in
# older ones, and the CPU-only switch is `use_cpu` in newer versions and
# `no_cuda` in older ones. Use whichever name the installed TrainingArguments
# declares so construction does not fail with an unexpected-keyword TypeError.
_training_arg_names = {f.name for f in fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_cpu_only_key = "use_cpu" if "use_cpu" in _training_arg_names else "no_cuda"

training_args = TrainingArguments(
    output_dir="./results",           # where results are saved
    num_train_epochs=1,               # a single epoch, for a quick test
    per_device_train_batch_size=4,    # small batch size (recommended on CPU)
    per_device_eval_batch_size=4,     # small batch size for evaluation as well
    save_strategy="epoch",            # save on the same schedule as evaluation
    logging_dir="./logs",             # where logs are saved
    logging_steps=100,                # log every 100 steps
    report_to="none",                 # disable logging integrations
    load_best_model_at_end=True,      # finish with the best model loaded
    **{
        _eval_strategy_key: "epoch",  # evaluate at the end of every epoch
        _cpu_only_key: True,          # do not use the GPU
    },
)

# Training function
def train_model():
    """Fine-tune the module-level model on the tokenized training split.

    Builds a ``Trainer`` from the module-level ``model``, ``training_args`` and
    the tokenized train/test splits, then runs ``trainer.train()``. Any
    exception raised by training still propagates to the caller.
    """
    trainer = Trainer(
        model=model,                             # model to train
        args=training_args,                      # training arguments
        train_dataset=tokenized_train_datasets,  # training split
        eval_dataset=tokenized_test_datasets,    # evaluation split
    )
    try:
        trainer.train()
    finally:
        # The same model object also serves predictions, so leave it in
        # evaluation mode (dropout disabled) whether training finished or failed.
        model.eval()

# Run training in a separate thread
def start_training():
    """Start ``train_model`` on a new thread and return without waiting for it."""
    threading.Thread(target=train_model).start()

# Text classification function (runs on the CPU)
def classify_text(text):
    """Predict the class index for ``text`` using the module-level model.

    Args:
        text: Input handed to the tokenizer.

    Returns:
        The index of the highest-scoring logit, converted to a string.
    """
    # NOTE(review): the model object is shared with the background training
    # thread; presumably it can be in training mode while training is running,
    # which would make predictions non-deterministic - confirm whether that
    # matters for this demo.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)
    with torch.no_grad():  # gradients are not needed for inference
        logits = model(**encoded).logits
    best_index = torch.argmax(logits, dim=-1).item()
    return str(best_index)  # returned as a string for the Gradio text output

# Gradio interface: one text input, one text output, served by classify_text.
demo = gr.Interface(
    fn=classify_text,
    inputs="text",
    outputs="text",
)

# Start training and launch the Gradio UI
def launch_app():
    """Start background training, then launch the Gradio UI."""
    start_training()  # start training (runs on its own thread)
    demo.launch()     # launch the Gradio UI

# Entry point used when the app runs on Hugging Face Spaces
if __name__ == "__main__":
    # Only start training and the UI when executed as a script, not on import.
    launch_app()