prath committed on
Commit
0645704
·
1 Parent(s): eed4fad

Upload 3 files

Files changed (3)
  1. model.py +20 -0
  2. sentiment_model.pth +3 -0
  3. train.py +159 -0
model.py ADDED
@@ -0,0 +1,20 @@
+ import torch
+ import torch.nn as nn
+ from transformers import BertModel
+
+
+ class SentimentClassifier(nn.Module):
+     def __init__(self, n_classes):
+         super(SentimentClassifier, self).__init__()
+         self.bert = BertModel.from_pretrained('bert-base-uncased')
+         self.drop = nn.Dropout(p=0.3)
+         self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
+
+     def forward(self, input_ids, attention_mask):
+         _, pooled_output = self.bert(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             return_dict=False
+         )
+         output = self.drop(pooled_output)
+         return self.out(output)
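
model.py wraps BERT's pooled [CLS] output with dropout and a linear head, so the module returns raw logits of shape (batch, n_classes). A minimal usage sketch (not part of the commit), assuming the same 'bert-base-uncased' tokenizer that train.py uses:

import torch
from transformers import BertTokenizer
from model import SentimentClassifier

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = SentimentClassifier(n_classes=2)  # head weights are random until trained
model.eval()

# Tokenize one sentence and run a single forward pass
enc = tokenizer("Great food and friendly staff!", return_tensors='pt',
                padding='max_length', truncation=True, max_length=100)
with torch.no_grad():
    logits = model(input_ids=enc['input_ids'],
                   attention_mask=enc['attention_mask'])
print(logits.shape)  # torch.Size([1, 2]): one logit per class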
sentiment_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fd2fdc804d5fb6efde18817e02f607a6c9f98ce47172f4d1a6c3b578d0bce4a
+ size 438019533
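
sentiment_model.pth is stored with Git LFS, so the diff records only the pointer file: the SHA-256 of the real blob and its size (438019533 bytes, roughly 418 MiB). To fetch the actual checkpoint programmatically, a sketch along these lines should work, where the repo_id '<user>/<repo>' is a placeholder, not taken from the commit:

from huggingface_hub import hf_hub_download

# repo_id below is a placeholder for this repository's actual id
path = hf_hub_download(repo_id='<user>/<repo>', filename='sentiment_model.pth')
print(path)  # local cache path of the resolved LFS file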
train.py ADDED
@@ -0,0 +1,159 @@
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ import pandas as pd
+ import numpy as np
+ import urllib.request
+ import zipfile
+ import os
+
+ from torch.utils.data import Dataset, DataLoader
+ from sklearn.model_selection import train_test_split
+ from transformers import BertTokenizer
+ from model import SentimentClassifier
+ # Download dataset
+ url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
+ filename = "sentiment.zip"
+
+ if not os.path.exists(filename):
+     urllib.request.urlretrieve(url, filename)
+
+ # Extract dataset
+ with zipfile.ZipFile(filename, 'r') as zip_ref:
+     zip_ref.extractall()
+
+ # Load dataset
+ filepath_dict = {'yelp': 'sentiment labelled sentences/yelp_labelled.txt',
+                  'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
+                  'imdb': 'sentiment labelled sentences/imdb_labelled.txt'}
+
+ df_list = []
+ for source, filepath in filepath_dict.items():
+     df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
+     df['source'] = source
+     df_list.append(df)
+
+ df = pd.concat(df_list)
+
+ # Split dataset into train and test sets
+ sentences = df['sentence'].values
+ labels = df['label'].values
+ train_sentences, test_sentences, train_labels, test_labels = train_test_split(
+     sentences, labels, test_size=0.25)
+
+ # Define tokenizer
+ tokenizer = BertTokenizer.from_pretrained(
+     'bert-base-uncased', do_lower_case=True)
+
+ # Define dataset
+
+
+ class SentimentDataset(Dataset):
+     def __init__(self, sentences, labels, tokenizer, max_len):
+         self.sentences = sentences
+         self.labels = labels
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.sentences)
+
+     def __getitem__(self, item):
+         sentence = str(self.sentences[item])
+         label = self.labels[item]
+
+         encoding = self.tokenizer.encode_plus(
+             sentence,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             return_token_type_ids=False,
+             padding='max_length', truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt'
+         )
+
+         return {'sentence': sentence,
+                 'input_ids': encoding['input_ids'].flatten(),
+                 'attention_mask': encoding['attention_mask'].flatten(),
+                 'label': torch.tensor(label, dtype=torch.long)}
+
+ # Model class is defined in model.py (SentimentClassifier)
+
+
+ # Set device
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ # Define hyperparameters
+ MAX_LEN = 100
+ BATCH_SIZE = 16
+ EPOCHS = 5
+
+ # Define dataloaders
+ train_dataset = SentimentDataset(
+     train_sentences, train_labels, tokenizer, MAX_LEN)
+ test_dataset = SentimentDataset(
+     test_sentences, test_labels, tokenizer, MAX_LEN)
+ train_dataloader = DataLoader(
+     train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
+ test_dataloader = DataLoader(
+     test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
+
+ # Define model and optimizer
+ model = SentimentClassifier(2)
+ model = model.to(device)
+ optimizer = optim.Adam(model.parameters(), lr=2e-5)
+
+ # Define loss function
+ criterion = nn.CrossEntropyLoss()
+
+ # Train model
+ for epoch in range(EPOCHS):
+     print('Epoch:', epoch+1)
+     train_loss = 0
+     train_acc = 0
+
+     model.train()
+     for batch in train_dataloader:
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         labels = batch['label'].to(device)
+
+         optimizer.zero_grad()
+
+         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+
+         loss = criterion(outputs, labels)
+         loss.backward()
+         optimizer.step()
+
+         train_loss += loss.item()
+         train_acc += (outputs.argmax(1) == labels).sum().item()
+
+     train_loss /= len(train_dataloader)
+     train_acc /= len(train_dataset)
+
+     print('Train loss:', train_loss, 'Train accuracy:', train_acc)
+
+     model.eval()
+     test_loss = 0
+     test_acc = 0
+
+     with torch.no_grad():
+         for batch in test_dataloader:
+             input_ids = batch['input_ids'].to(device)
+             attention_mask = batch['attention_mask'].to(device)
+             labels = batch['label'].to(device)
+
+             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+
+             loss = criterion(outputs, labels)
+
+             test_loss += loss.item()
+             test_acc += (outputs.argmax(1) == labels).sum().item()
+
+     test_loss /= len(test_dataloader)
+     test_acc /= len(test_dataset)
+
+     print('Test loss:', test_loss, 'Test accuracy:', test_acc)
+
+ torch.save(model.cpu().state_dict(), 'sentiment_model.pth')
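
train.py saves only the CPU state dict, so inference requires rebuilding the module before loading the weights. A minimal inference sketch (not part of the commit), assuming sentiment_model.pth is in the working directory and the UCI dataset's label convention (0 = negative, 1 = positive):

import torch
from transformers import BertTokenizer
from model import SentimentClassifier

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Rebuild the module, then load the trained weights
model = SentimentClassifier(2)
model.load_state_dict(torch.load('sentiment_model.pth', map_location=device))
model = model.to(device)
model.eval()

enc = tokenizer("The battery life is terrible.", return_tensors='pt',
                padding='max_length', truncation=True, max_length=100)
with torch.no_grad():
    logits = model(input_ids=enc['input_ids'].to(device),
                   attention_mask=enc['attention_mask'].to(device))
print('positive' if logits.argmax(1).item() == 1 else 'negative')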