Spaces:
Runtime error
Upload 3 files
- model.py +20 -0
- sentiment_model.pth +3 -0
- train.py +159 -0
model.py
ADDED
@@ -0,0 +1,20 @@
import torch
import torch.nn as nn
from transformers import BertModel


class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        output = self.drop(pooled_output)
        return self.out(output)
sentiment_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5fd2fdc804d5fb6efde18817e02f607a6c9f98ce47172f4d1a6c3b578d0bce4a
size 438019533
train.py
ADDED
@@ -0,0 +1,159 @@
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import urllib.request
import zipfile
import os

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from model import SentimentClassifier

# Download dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
filename = "sentiment.zip"

if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

# Extract dataset
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()

# Load dataset
filepath_dict = {'yelp': 'sentiment labelled sentences/yelp_labelled.txt',
                 'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
                 'imdb': 'sentiment labelled sentences/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

df = pd.concat(df_list)

# Split dataset into train and test sets
sentences = df['sentence'].values
labels = df['label'].values
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.25)

# Define tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)

# Define dataset


class SentimentDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {'sentence': sentence,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'label': torch.tensor(label, dtype=torch.long)}

# Define model


# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define hyperparameters
MAX_LEN = 100
BATCH_SIZE = 16
EPOCHS = 5

# Define dataloaders
train_dataset = SentimentDataset(
    train_sentences, train_labels, tokenizer, MAX_LEN)
test_dataset = SentimentDataset(
    test_sentences, test_labels, tokenizer, MAX_LEN)
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Define model and optimizer
model = SentimentClassifier(2)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Train model
for epoch in range(EPOCHS):
    print('Epoch:', epoch+1)
    train_loss = 0
    train_acc = 0

    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()

    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataset)

    print('Train loss:', train_loss, 'Train accuracy:', train_acc)

    model.eval()
    test_loss = 0
    test_acc = 0

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            loss = criterion(outputs, labels)

            test_loss += loss.item()
            test_acc += (outputs.argmax(1) == labels).sum().item()

    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataset)

    print('Test loss:', test_loss, 'Test accuracy:', test_acc)

torch.save(model.cpu().state_dict(), 'sentiment_model.pth')
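For reference, here is a minimal inference sketch (not part of the commit) showing how the uploaded files fit together: it reloads the state dict saved by train.py into the SentimentClassifier from model.py and classifies a single sentence. The predict helper and the example sentence are illustrative assumptions; the 0 = negative / 1 = positive mapping follows the UCI dataset that train.py downloads.

# Sketch only: reload the saved checkpoint and classify one sentence.
import torch
from transformers import BertTokenizer
from model import SentimentClassifier

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

model = SentimentClassifier(2)
model.load_state_dict(torch.load('sentiment_model.pth', map_location=device))
model.to(device)
model.eval()

def predict(sentence, max_len=100):
    # Tokenize as SentimentDataset does, padded/truncated to the training MAX_LEN
    # (padding/truncation flags use the current transformers API).
    encoding = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        logits = model(input_ids=encoding['input_ids'].to(device),
                       attention_mask=encoding['attention_mask'].to(device))
    return int(logits.argmax(dim=1).item())  # 0 or 1; 1 is positive in the UCI labelling

print(predict("The food was great and the service was friendly."))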