import os
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

from model import SentimentClassifier

# Download dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
filename = "sentiment.zip"
if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

# Extract dataset
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()

# Load dataset
filepath_dict = {'yelp': 'sentiment labelled sentences/yelp_labelled.txt',
                 'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
                 'imdb': 'sentiment labelled sentences/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)
df = pd.concat(df_list)

# Split dataset into train and test sets
sentences = df['sentence'].values
labels = df['label'].values
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.25)

# Define tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)


# Define dataset
class SentimentDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',   # pad_to_max_length is deprecated in recent transformers
            truncation=True,        # truncate sentences longer than max_len
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {'sentence': sentence,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'label': torch.tensor(label, dtype=torch.long)}


# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define hyperparameters
MAX_LEN = 100
BATCH_SIZE = 16
EPOCHS = 5

# Define dataloaders
train_dataset = SentimentDataset(
    train_sentences, train_labels, tokenizer, MAX_LEN)
test_dataset = SentimentDataset(
    test_sentences, test_labels, tokenizer, MAX_LEN)
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Define model and optimizer
model = SentimentClassifier(2)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Train model
for epoch in range(EPOCHS):
    print('Epoch:', epoch + 1)
    train_loss = 0
    train_acc = 0
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()
    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataset)
    print('Train loss:', train_loss, 'Train accuracy:', train_acc)

    # Evaluate on the test set after each epoch
    model.eval()
    test_loss = 0
    test_acc = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            test_loss += loss.item()
            test_acc += (outputs.argmax(1) == labels).sum().item()

    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataset)
    print('Test loss:', test_loss, 'Test accuracy:', test_acc)

torch.save(model.cpu().state_dict(), 'sentiment_model.pth')
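

# --- Reference sketch (assumption) -----------------------------------------
# model.py is not included in this script. The class below is a minimal sketch
# of what model.SentimentClassifier might look like, inferred only from how it
# is used above: constructed with a number of classes, called with input_ids
# and attention_mask, and expected to return raw logits for CrossEntropyLoss.
# It is not necessarily the actual implementation; the dropout rate and the
# use of BERT's pooled [CLS] output are assumptions. It relies on the nn and
# BertModel imports at the top of this file.
class _ReferenceSentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        # Pretrained BERT encoder matching the tokenizer used above
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output          # pooled [CLS] representation
        return self.out(self.drop(pooled))      # logits of shape (batch, n_classes)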