Spaces:
Runtime error
Upload 3 files
- model.py +20 -0
- sentiment_model.pth +3 -0
- train.py +159 -0
model.py
ADDED
@@ -0,0 +1,20 @@
import torch
import torch.nn as nn
from transformers import BertModel


class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        output = self.drop(pooled_output)
        return self.out(output)
sentiment_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5fd2fdc804d5fb6efde18817e02f607a6c9f98ce47172f4d1a6c3b578d0bce4a
size 438019533
train.py
ADDED
@@ -0,0 +1,159 @@
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import urllib.request
import zipfile
import os

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from model import SentimentClassifier

# Download dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
filename = "sentiment.zip"

if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

# Extract dataset
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()

# Load dataset
filepath_dict = {'yelp': 'sentiment labelled sentences/yelp_labelled.txt',
                 'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
                 'imdb': 'sentiment labelled sentences/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

df = pd.concat(df_list)

# Split dataset into train and test sets
sentences = df['sentence'].values
labels = df['label'].values
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.25)

# Define tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)

# Define dataset


class SentimentDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {'sentence': sentence,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'label': torch.tensor(label, dtype=torch.long)}

# Define model


# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define hyperparameters
MAX_LEN = 100
BATCH_SIZE = 16
EPOCHS = 5

# Define dataloaders
train_dataset = SentimentDataset(
    train_sentences, train_labels, tokenizer, MAX_LEN)
test_dataset = SentimentDataset(
    test_sentences, test_labels, tokenizer, MAX_LEN)
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Define model and optimizer
model = SentimentClassifier(2)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Train model
for epoch in range(EPOCHS):
    print('Epoch:', epoch+1)
    train_loss = 0
    train_acc = 0

    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()

    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataset)

    print('Train loss:', train_loss, 'Train accuracy:', train_acc)

    model.eval()
    test_loss = 0
    test_acc = 0

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            loss = criterion(outputs, labels)

            test_loss += loss.item()
            test_acc += (outputs.argmax(1) == labels).sum().item()

    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataset)

    print('Test loss:', test_loss, 'Test accuracy:', test_acc)

torch.save(model.cpu().state_dict(), 'sentiment_model.pth')
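For reference, here is a minimal inference sketch (not part of the commit) showing how the uploaded files fit together: it reloads the state dict saved by train.py into the SentimentClassifier from model.py and classifies a single sentence. The predict helper and the example sentence are illustrative assumptions; the 0 = negative / 1 = positive mapping follows the UCI dataset that train.py downloads.

# Sketch only: reload the saved checkpoint and classify one sentence.
import torch
from transformers import BertTokenizer
from model import SentimentClassifier

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

model = SentimentClassifier(2)
model.load_state_dict(torch.load('sentiment_model.pth', map_location=device))
model.to(device)
model.eval()

def predict(sentence, max_len=100):
    # Tokenize as SentimentDataset does, padded/truncated to the training MAX_LEN
    # (padding/truncation flags use the current transformers API).
    encoding = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        logits = model(input_ids=encoding['input_ids'].to(device),
                       attention_mask=encoding['attention_mask'].to(device))
    return int(logits.argmax(dim=1).item())  # 0 or 1; 1 is positive in the UCI labelling

print(predict("The food was great and the service was friendly."))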