# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
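
"""Smoke tests for the feedforward blocks registered in FEEDFORWARD_REGISTRY:
construction and a forward pass for every block, plus a dedicated
forward/backward test for the MixtureOfExperts block (CUDA only)."""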

import pytest
import torch

from xformers.components import Activation
from xformers.components.feedforward import FEEDFORWARD_REGISTRY, build_feedforward
from xformers.components.feedforward.mixture_of_experts import GateConfig
from xformers.helpers.test_utils import init_torch_distributed_local

# Dummy dimensions shared by the tests below
BATCH = 4
SEQ = 256
EMBD = 16
LATENT = 128
DROPOUT = 0.5

# Prefer CUDA when available; some feedforward blocks require it
DEVICES = (
    [torch.device("cpu")] if not torch.cuda.is_available() else [torch.device("cuda")]
)

assert FEEDFORWARD_REGISTRY.keys(), "Feedforward layers should have been registered"


@pytest.mark.parametrize("feedforward_name", FEEDFORWARD_REGISTRY.keys())
@pytest.mark.parametrize("activation", [a.value for a in Activation])
@pytest.mark.parametrize("device", DEVICES)
def test_feedforward(
    feedforward_name: str, activation: Activation, device: torch.device
):
    test_config = {
        "name": feedforward_name,
        "dim_model": LATENT,
        "dropout": DROPOUT,
        "activation": activation,
        "hidden_layer_multiplier": 4,
        "number_of_experts": 4,  # MoE
        "gate": "top_2",  # MoE
    }

    if feedforward_name == "MixtureOfExperts":
        init_torch_distributed_local()

    # dummy, just check construction and dimensions in the FW pass
    ffw = build_feedforward(test_config)

    if ffw.requires_cuda and not device.type == "cuda":
        # pyre-fixme[29]: The library function `pytest.skip` is not supported by Pyre.
        pytest.skip("This MLP requires CUDA and current device does not match")

    inputs = torch.rand(BATCH, SEQ, LATENT, device=device)
    ffw = ffw.to(device)

    _ = ffw(inputs)


def get_expert():
    # Custom expert used to exercise the expert_constructor code path below
    return torch.nn.Linear(LATENT, LATENT, bias=False)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="This test requires CUDA")
@pytest.mark.parametrize("gate", [g.value for g in GateConfig])
@pytest.mark.parametrize("number_of_local_experts", [None, 4])
@pytest.mark.parametrize("expert_constructor", [None, get_expert])
def test_moe(gate, number_of_local_experts, expert_constructor):
    test_config = {
        "name": "MixtureOfExperts",
        "dim_model": LATENT,
        "dropout": DROPOUT,
        "activation": Activation.ReLU,
        "hidden_layer_multiplier": 4,
        "number_of_experts": 4,
        "number_of_local_experts": number_of_local_experts,
        "gate": gate,
        "expert_constructor": expert_constructor,
    }

    # MoE expects torch.distributed to be initialized, even for a single process
    init_torch_distributed_local()

    # dummy, just check construction and dimensions in the FW pass
    ffw = build_feedforward(test_config)

    inputs = torch.rand(BATCH, SEQ, LATENT, device=torch.device("cuda"))
    ffw = ffw.to(torch.device("cuda"))

    outputs = ffw(inputs)

    # also check that gradients flow back through the gating mechanism
    loss = torch.sum(outputs)
    loss.backward()
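
# Usage sketch (assuming the upstream xformers layout, where this file lives
# under tests/ as tests/test_feedforward.py):
#   pytest tests/test_feedforward.py -v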