import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import urllib.request
import zipfile
import os
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from model import SentimentClassifier
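
# SentimentClassifier lives in model.py, which is not shown in this file. The
# sketch below is an assumption about its interface (a bert-base-uncased encoder
# with a dropout + linear head returning raw logits for n_classes), included only
# so the calls further down are easier to follow:
#
# class SentimentClassifier(nn.Module):
#     def __init__(self, n_classes):
#         super().__init__()
#         self.bert = BertModel.from_pretrained('bert-base-uncased')
#         self.drop = nn.Dropout(p=0.3)
#         self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
#
#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         return self.out(self.drop(outputs.pooler_output))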

# Download dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
filename = "sentiment.zip"
if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

# Extract dataset
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()
# Load dataset
filepath_dict = {'yelp': 'sentiment labelled sentences/yelp_labelled.txt',
                 'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
                 'imdb': 'sentiment labelled sentences/imdb_labelled.txt'}
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)
df = pd.concat(df_list)

# Split dataset into train and test sets
sentences = df['sentence'].values
labels = df['label'].values
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.25)

# Define tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)

# Define dataset
class SentimentDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {'sentence': sentence,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'label': torch.tensor(label, dtype=torch.long)}
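
# Optional sanity check: each item should yield fixed-length tensors of size max_len.
# sample = SentimentDataset(train_sentences, train_labels, tokenizer, 100)[0]
# print(sample['input_ids'].shape, sample['attention_mask'].shape, sample['label'])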

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define hyperparameters
MAX_LEN = 100
BATCH_SIZE = 16
EPOCHS = 5

# Define dataloaders
train_dataset = SentimentDataset(
    train_sentences, train_labels, tokenizer, MAX_LEN)
test_dataset = SentimentDataset(
    test_sentences, test_labels, tokenizer, MAX_LEN)
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
test_dataloader = DataLoader(  # no need to shuffle evaluation data
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Define model and optimizer
model = SentimentClassifier(2)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Train model
for epoch in range(EPOCHS):
    print('Epoch:', epoch + 1)
    train_loss = 0
    train_acc = 0
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()
    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataset)
    print('Train loss:', train_loss, 'Train accuracy:', train_acc)
    # Evaluate on the held-out test set after each epoch
    model.eval()
    test_loss = 0
    test_acc = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            test_acc += (outputs.argmax(1) == labels).sum().item()
    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataset)
    print('Test loss:', test_loss, 'Test accuracy:', test_acc)

# Save trained weights (moved to CPU so they load on any device)
torch.save(model.cpu().state_dict(), 'sentiment_model.pth')
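
# Inference sketch: reload the saved weights and classify a new sentence. This
# assumes the same SentimentClassifier definition from model.py; in this dataset
# label 1 is positive and label 0 is negative.
inference_model = SentimentClassifier(2)
inference_model.load_state_dict(torch.load('sentiment_model.pth', map_location='cpu'))
inference_model.eval()
example = tokenizer.encode_plus(
    "The movie was surprisingly good.",
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
    return_attention_mask=True,
    return_tensors='pt')
with torch.no_grad():
    logits = inference_model(input_ids=example['input_ids'],
                             attention_mask=example['attention_mask'])
print('Predicted label:', logits.argmax(1).item())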