# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from fairseq import utils
from fairseq.models import (
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.lightconv import Embedding, LightConvDecoder
from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder
@register_model("lightconv_lm")
class LightConvLanguageModel(FairseqLanguageModel):
    """Decoder-only language model built from LightConv/DynamicConv blocks.

    Thin wrapper that binds a :class:`LightConvDecoder` (run with
    ``no_encoder_attn=True``) to the ``FairseqLanguageModel`` interface.
    """

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--attention-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--input-dropout",
            type=float,
            metavar="D",
            help="dropout probability of the inputs",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-output-dim",
            type=int,
            metavar="N",
            help="decoder output dimension",
        )
        parser.add_argument(
            "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads or LightConv/DynamicConv heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            default=False,
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )
        parser.add_argument(
            "--adaptive-softmax-dropout",
            type=float,
            metavar="D",
            help="sets adaptive softmax dropout for the tail projections",
        )
        parser.add_argument(
            "--adaptive-softmax-factor",
            type=float,
            metavar="N",
            help="adaptive input factor",
        )
        parser.add_argument(
            "--no-token-positional-embeddings",
            default=False,
            action="store_true",
            help="if set, disables positional embeddings (outside self attention)",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            default=False,
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--character-embeddings",
            default=False,
            action="store_true",
            help="if set, uses character embedding convolutions to produce token embeddings",
        )
        parser.add_argument(
            "--character-filters",
            type=str,
            metavar="LIST",
            default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
            help="size of character embeddings",
        )
        parser.add_argument(
            "--character-embedding-dim",
            type=int,
            metavar="N",
            default=4,
            help="size of character embeddings",
        )
        parser.add_argument(
            "--char-embedder-highway-layers",
            type=int,
            metavar="N",
            default=2,
            help="number of highway layers for character token embeddder",
        )
        parser.add_argument(
            "--adaptive-input",
            default=False,
            action="store_true",
            help="if set, uses adaptive input",
        )
        parser.add_argument(
            "--adaptive-input-factor",
            type=float,
            metavar="N",
            help="adaptive input factor",
        )
        parser.add_argument(
            "--adaptive-input-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive input cutoff points.",
        )
        parser.add_argument(
            "--tie-adaptive-weights",
            action="store_true",
            help="if set, ties the weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--tie-adaptive-proj",
            action="store_true",
            help="if set, ties the projection weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--decoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the decoder",
        )
        # LightConv and DynamicConv arguments
        parser.add_argument(
            "--decoder-kernel-size-list",
            type=lambda x: utils.eval_str_list(x, int),
            help='list of kernel size (default: "[3,7,15,31,31,31]")',
        )
        parser.add_argument(
            "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
        )
        parser.add_argument(
            "--decoder-conv-type",
            default="dynamic",
            type=str,
            choices=["dynamic", "lightweight"],
            help="type of convolution",
        )
        parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
        parser.add_argument(
            "--weight-dropout",
            type=float,
            metavar="D",
            help="dropout probability for conv weights",
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        Args:
            args: argument namespace of hyperparameters (filled in with
                defaults by :func:`base_lm_architecture`).
            task: fairseq task providing ``dictionary`` / ``output_dictionary``.

        Returns:
            A :class:`LightConvLanguageModel` wrapping a freshly constructed
            decoder.
        """
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        # Older checkpoints may lack max positions; fall back to the sample
        # length used for LM training.
        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = args.tokens_per_sample
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            # NOTE(review): eval() on a CLI-supplied string — assumes the
            # --character-filters argument is trusted.
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary),
                task.dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(
                len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()
            )

        if args.tie_adaptive_weights:
            # Tying is only valid when the input embedding is adaptive and its
            # geometry matches the adaptive softmax exactly.
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (
                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
            ), "{} != {}".format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
            )
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = LightConvDecoder(
            args,
            task.output_dictionary,
            embed_tokens,
            no_encoder_attn=True,
            final_norm=False,
        )
        return LightConvLanguageModel(decoder)
@register_model_architecture("lightconv_lm", "lightconv_lm")
def base_lm_architecture(args):
    """Fill in default hyperparameters for any attribute missing from ``args``.

    Mutates ``args`` in place. Serves both as the registered base
    architecture and as an upgrade path for argument namespaces saved by
    older checkpoints. Assumes ``args.attention_dropout`` is already set
    (the argument parser provides a default of 0.0).
    """
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
    args.decoder_layers = getattr(args, "decoder_layers", 6)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
    args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
    args.character_embeddings = getattr(args, "character_embeddings", False)
    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
    args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim)

    # The model training is not stable without this
    args.decoder_normalize_before = True

    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4)
    args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None)
    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
    args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)

    args.decoder_kernel_size_list = getattr(
        args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31]
    )
    # A single kernel size is broadcast to every decoder layer.
    if len(args.decoder_kernel_size_list) == 1:
        args.decoder_kernel_size_list = (
            args.decoder_kernel_size_list * args.decoder_layers
        )
    assert (
        len(args.decoder_kernel_size_list) == args.decoder_layers
    ), "decoder_kernel_size_list doesn't match decoder_layers"
    args.decoder_glu = getattr(args, "decoder_glu", True)
    args.input_dropout = getattr(args, "input_dropout", 0.1)
    args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout)
@register_model_architecture("lightconv_lm", "lightconv_lm_gbw")
def lightconv_lm_gbw(args):
    """Scaled-up architecture ("gbw" — presumably Google Billion Words).

    Relative to the base architecture: wider FFN (4096), more heads (16),
    and nonzero attention dropout. Mutates ``args`` in place, then delegates
    remaining defaults to :func:`base_lm_architecture`.
    """
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.dropout = getattr(args, "dropout", 0.1)
    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
    base_lm_architecture(args)