# P2A-test-NV / laser /source /sent_classif.py
# KuangDW
# Add laser2.spm using Git LFS
# 05d3571
#!/usr/bin/python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Simple MLP classifier for sentence embeddings
import argparse
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils
################################################
def LoadData(bdir, dfn, lfn, dim=1024, bsize=32, shuffle=False, quiet=False):
    """Build a DataLoader over raw float32 embeddings and their labels.

    Args:
        bdir: Directory prefix. It is concatenated verbatim with ``dfn`` and
            ``lfn``, so it must already end with a path separator.
        dfn: Name of the binary file holding the embeddings as a flat
            sequence of float32 values.
        lfn: Name of the text file holding the integer labels (read with
            ``np.loadtxt``).
        dim: Number of float32 values per embedding (row width).
        bsize: Batch size of the returned loader.
        shuffle: Whether the loader reshuffles the examples.
        quiet: If True, do not print the summary of what was read.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(embeddings, labels)``
        batches; embeddings are float32, labels are int32 and 1-D per batch.

    Raises:
        ValueError: If the number of embeddings and labels differ.
    """
    x = np.fromfile(bdir + dfn, dtype=np.float32, count=-1)
    # Trailing values that do not fill a complete row are dropped.
    x.resize(x.shape[0] // dim, dim)

    # ndmin=1 keeps a one-line label file 1-D instead of a 0-d scalar array.
    # Labels are intentionally left 1-D: the training code feeds them to
    # CrossEntropyLoss as class indices.
    lbl = np.loadtxt(bdir + lfn, dtype=np.int32, ndmin=1)

    if x.shape[0] != lbl.shape[0]:
        raise ValueError(
            'Mismatch between {:d} embeddings in {:s} and {:d} labels in {:s}'
            .format(x.shape[0], dfn, lbl.shape[0], lfn))

    if not quiet:
        print(' - read {:d}x{:d} elements in {:s}'
              .format(x.shape[0], x.shape[1], dfn))
        print(' - read {:d} labels [{:d},{:d}] in {:s}'
              .format(lbl.shape[0], lbl.min(), lbl.max(), lfn))

    D = data_utils.TensorDataset(torch.from_numpy(x), torch.from_numpy(lbl))
    loader = data_utils.DataLoader(D, batch_size=bsize, shuffle=shuffle)
    return loader
################################################
class Net(nn.Module):
    """Simple MLP classifier for sentence embeddings.

    The network is a ``nn.Sequential`` stored in ``self.mlp``. It outputs raw
    scores (no softmax layer): the softmax is part of ``CrossEntropyLoss``.
    """

    def __init__(self, idim=1024, odim=2, nhid=None,
                 dropout=0.0, gpu=0, activation='TANH'):
        """Build the MLP and print a one-line summary of its layout.

        Args:
            idim: Input (embedding) dimension.
            odim: Number of output classes.
            nhid: Iterable of hidden layer sizes; entries <= 0 are skipped.
                ``None`` or an empty list gives a single linear layer.
            dropout: Dropout probability; dropout layers are only inserted
                when it is > 0 and ``nhid`` is non-empty.
            gpu: If >= 0 the network is moved to CUDA, otherwise it stays
                on the CPU.
            activation: 'TANH' or 'RELU', applied after each hidden layer.

        Raises:
            ValueError: If a hidden layer is requested with an activation
                other than 'TANH' or 'RELU'.
        """
        super(Net, self).__init__()
        self.gpu = gpu
        if nhid is None:
            nhid = []

        modules = []
        print(' - mlp {:d}'.format(idim), end='')
        if len(nhid) > 0:
            if dropout > 0:
                modules.append(nn.Dropout(p=dropout))
            nprev = idim
            for nh in nhid:
                if nh > 0:
                    modules.append(nn.Linear(nprev, nh))
                    nprev = nh
                    if activation == 'TANH':
                        modules.append(nn.Tanh())
                        print('-{:d}t'.format(nh), end='')
                    elif activation == 'RELU':
                        modules.append(nn.ReLU())
                        print('-{:d}r'.format(nh), end='')
                    else:
                        raise ValueError(
                            'Unrecognized activation {}'.format(activation))
                    if dropout > 0:
                        modules.append(nn.Dropout(p=dropout))
            modules.append(nn.Linear(nprev, odim))
            print('-{:d}, dropout={:.1f}'.format(odim, dropout))
        else:
            modules.append(nn.Linear(idim, odim))
            # Completes the ' - mlp <idim>' prefix printed above.
            print('-{:d}'.format(odim))
        self.mlp = nn.Sequential(*modules)
        # Softmax is included in CrossEntropyLoss!

        if self.gpu >= 0:
            self.mlp = self.mlp.cuda()

    def forward(self, x):
        """Return the raw class scores of the MLP for input batch ``x``."""
        return self.mlp(x)

    def TestCorpus(self, dset, name=' Dev', nlbl=4):
        """Evaluate the network on ``dset`` and print accuracy statistics.

        Prints (without a trailing newline) the accuracy and, for each class
        index below ``nlbl``, the percentage of examples predicted as that
        class.

        Args:
            dset: Iterable of ``(inputs, labels)`` batches.
            name: Tag printed in front of the accuracy.
            nlbl: Number of class indices to report prediction shares for.

        Returns:
            Tuple ``(correct, total)``: ``correct`` is a 0-dim integer tensor
            with the number of correct predictions (kept as a tensor because
            callers invoke ``.float()`` on it), ``total`` is the number of
            examples seen as a Python int.
        """
        self.mlp.train(mode=False)
        n_correct = 0
        total = 0
        corr = np.zeros(nlbl, dtype=np.int32)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for X, Y in dset:
                Y = Y.long()
                if self.gpu >= 0:
                    X = X.cuda()
                    Y = Y.cuda()
                outputs = self.mlp(X)
                _, predicted = torch.max(outputs, 1)
                total += Y.size(0)
                # .item() yields plain Python ints, so no device tensor is
                # mixed into Python/numpy accumulators.
                n_correct += (predicted == Y).sum().item()
                for i in range(nlbl):
                    corr[i] += (predicted == i).sum().item()

        # An empty corpus reports 0% instead of dividing by zero.
        denom = total if total > 0 else 1
        print(' | {:4s}: {:5.2f}%'
              .format(name, 100.0 * n_correct / denom), end='')
        print(' | classes:', end='')
        for i in range(nlbl):
            print(' {:5.2f}'.format(100.0 * corr[i] / denom), end='')

        return torch.tensor(n_correct), total
################################################
# Command line interface of the classifier script.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Simple sentence classifier")

# Data
parser.add_argument(
    '--base-dir', '-b', type=str, required=True, metavar='PATH',
    help="Directory with all the data files")
parser.add_argument(
    '--save', '-s', type=str, required=False, metavar='PATH', default="",
    help="File in which to save best network")
parser.add_argument(
    '--train', '-t', type=str, required=True, metavar='STR',
    help="Name of training corpus")
parser.add_argument(
    '--train-labels', '-T', type=str, required=True, metavar='STR',
    help="Name of training corpus (labels)")
parser.add_argument(
    '--dev', '-d', type=str, required=True, metavar='STR',
    help="Name of development corpus")
parser.add_argument(
    '--dev-labels', '-D', type=str, required=True, metavar='STR',
    help="Name of development corpus (labels)")
parser.add_argument(
    '--test', '-e', type=str, required=True, metavar='STR',
    help="Name of test corpus without language extension")
parser.add_argument(
    '--test-labels', '-E', type=str, required=True, metavar='STR',
    help="Name of test corpus without language extension (labels)")
# Defaults to an empty list (not None) so that the per-language test loop
# can always iterate over it.
parser.add_argument(
    '--lang', '-L', nargs='+', default=[],
    help="List of languages to test on")

# network definition
parser.add_argument(
    "--dim", "-m", type=int, default=1024,
    help="Dimension of sentence embeddings")
parser.add_argument(
    '--nhid', '-n', type=int, default=[0], nargs='+',
    help="List of hidden layer(s) dimensions")
parser.add_argument(
    "--nb-classes", "-c", type=int, default=2,
    help="Number of output classes")
parser.add_argument(
    '--dropout', '-o', type=float, default=0.0, metavar='FLOAT',
    help="Value of dropout")
parser.add_argument(
    '--nepoch', '-N', type=int, default=100, metavar='INT',
    help="Number of epochs")
parser.add_argument(
    '--bsize', '-B', type=int, default=128, metavar='INT',
    help="Batch size")
parser.add_argument(
    '--seed', '-S', type=int, default=123456789, metavar='INT',
    help="Initial random seed")
parser.add_argument(
    '--lr', type=float, default=0.001, metavar='FLOAT',
    help='Learning rate')
parser.add_argument(
    '--wdecay', type=float, default=0.0, metavar='FLOAT',
    help='Weight decay')
parser.add_argument(
    '--gpu', '-g', type=int, default=-1, metavar='INT',
    help="GPU id (-1 for CPU)")
args = parser.parse_args()

print(' - base directory: {}'.format(args.base_dir))
# LoadData() builds file paths by plain string concatenation, so the
# directory needs a trailing separator.
args.base_dir = args.base_dir + "/"

train_loader = LoadData(args.base_dir, args.train, args.train_labels,
                        dim=args.dim, bsize=args.bsize, shuffle=True)

dev_loader = LoadData(args.base_dir, args.dev, args.dev_labels,
                      dim=args.dim, bsize=args.bsize, shuffle=False)

# set GPU and random seed
if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
print(" - setting seed to %d" % args.seed)

# create network
net = Net(idim=args.dim, odim=args.nb_classes,
          nhid=args.nhid, dropout=args.dropout, gpu=args.gpu)
if args.gpu >= 0:
    criterion = nn.CrossEntropyLoss().cuda()
else:
    criterion = nn.CrossEntropyLoss()

# betas, eps and amsgrad are spelled out explicitly so that the
# hyper-parameters in use are visible here.
optimizer = optim.Adam(net.parameters(),
                       lr=args.lr,
                       weight_decay=args.wdecay,
                       betas=(0.9, 0.999),
                       eps=1e-8,
                       amsgrad=False)

corr_best = 0
nbex = 0
# Stays None until at least one epoch has been evaluated on the dev set.
net_best = None

# loop multiple times over the dataset
for epoch in range(args.nepoch):

    loss_epoch = 0.0
    print('Ep {:4d}'.format(epoch), end='')
    # TestCorpus() switches the MLP to evaluation mode at the end of every
    # epoch, so training mode is re-enabled here.
    net.train(mode=True)
    for inputs, labels in train_loader:
        labels = labels.long()
        if args.gpu >= 0:
            inputs = inputs.cuda()
            labels = labels.cuda()

        # zero the parameter gradients
        net.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        loss_epoch += loss.item()

    print(' | loss {:e}'.format(loss_epoch), end='')

    corr, nbex = net.TestCorpus(dev_loader, 'Dev')
    # ">=" keeps the most recent network among equally good epochs.
    if corr >= corr_best:
        print(' | saved')
        corr_best = corr
        net_best = copy.deepcopy(net)
    else:
        print('')

if net_best is None:
    # Happens when --nepoch is 0: there is no network to save or evaluate.
    print('No epoch was run: nothing to save or evaluate')
else:
    if args.save != '':
        torch.save(net_best.cpu(), args.save)
    print('Best Dev: {:d} = {:5.2f}%'
          .format(int(corr_best), 100.0 * float(corr_best) / max(nbex, 1)))

    if args.gpu >= 0:
        net_best = net_best.cuda()

    # test on (several) languages
    for l in (args.lang or []):
        test_loader = LoadData(args.base_dir, args.test + '.' + l,
                               args.test_labels + '.' + l,
                               dim=args.dim, bsize=args.bsize,
                               shuffle=False, quiet=True)
        print('Ep best | Eval Test lang {:s}'.format(l), end='')
        net_best.TestCorpus(test_loader, 'Test')
        print('')