3v324v23 committed on
Commit 7e6ee0b · 1 Parent(s): c2bb5f2

Add application file

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. customs/customsf +0 -0
  2. data/__init__.py +0 -3
  3. data/__pycache__/__init__.cpython-311.pyc +0 -0
  4. data/__pycache__/collation.cpython-311.pyc +0 -0
  5. data/__pycache__/input_strategies.cpython-311.pyc +0 -0
  6. data/__pycache__/tokenizer.cpython-311.pyc +0 -0
  7. data/collation.py +0 -120
  8. data/datamodule.py +0 -419
  9. data/dataset.py +0 -242
  10. data/fbank.py +0 -212
  11. data/input_strategies.py +0 -159
  12. data/tokenizer.py +0 -126
  13. macros.py +0 -44
  14. models/__init__.py +0 -136
  15. models/__pycache__/__init__.cpython-311.pyc +0 -0
  16. models/__pycache__/macros.cpython-311.pyc +0 -0
  17. models/__pycache__/transformer.cpython-311.pyc +0 -0
  18. models/__pycache__/vallex.cpython-311.pyc +0 -0
  19. models/__pycache__/visualizer.cpython-311.pyc +0 -0
  20. models/macros.py +0 -11
  21. models/transformer.py +0 -394
  22. models/vallex.py +0 -853
  23. models/visualizer.py +0 -106
  24. modules/__init__.py +0 -0
  25. modules/__pycache__/__init__.cpython-311.pyc +0 -0
  26. modules/__pycache__/activation.cpython-311.pyc +0 -0
  27. modules/__pycache__/embedding.cpython-311.pyc +0 -0
  28. modules/__pycache__/scaling.cpython-311.pyc +0 -0
  29. modules/__pycache__/transformer.cpython-311.pyc +0 -0
  30. modules/activation.py +0 -612
  31. modules/embedding.py +0 -97
  32. modules/optim.py +0 -1105
  33. modules/scaling.py +0 -1401
  34. modules/scheduler.py +0 -78
  35. modules/transformer.py +0 -683
  36. prompts/promptsf +0 -0
  37. utils/__init__.py +0 -15
  38. utils/__pycache__/__init__.cpython-311.pyc +0 -0
  39. utils/__pycache__/generation.cpython-311.pyc +0 -0
  40. utils/__pycache__/prompt_making.cpython-311.pyc +0 -0
  41. utils/__pycache__/sentence_cutter.cpython-311.pyc +0 -0
  42. utils/__pycache__/symbol_table.cpython-311.pyc +0 -0
  43. utils/download.py +0 -49
  44. utils/g2p/__init__.py +0 -72
  45. utils/g2p/__pycache__/__init__.cpython-311.pyc +0 -0
  46. utils/g2p/__pycache__/cleaners.cpython-311.pyc +0 -0
  47. utils/g2p/__pycache__/english.cpython-311.pyc +0 -0
  48. utils/g2p/__pycache__/japanese.cpython-311.pyc +0 -0
  49. utils/g2p/__pycache__/mandarin.cpython-311.pyc +0 -0
  50. utils/g2p/__pycache__/symbols.cpython-311.pyc +0 -0
customs/customsf DELETED
File without changes
data/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- # from .datamodule import *
2
- # from .tokenizer import *
3
- from .collation import *
 
data/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (208 Bytes)
 
data/__pycache__/collation.cpython-311.pyc DELETED
Binary file (7.2 kB)
 
data/__pycache__/input_strategies.cpython-311.pyc DELETED
Binary file (1.8 kB)
 
data/__pycache__/tokenizer.cpython-311.pyc DELETED
Binary file (6.77 kB)
 
data/collation.py DELETED
@@ -1,120 +0,0 @@
1
- from pathlib import Path
2
- from typing import List, Tuple
3
-
4
- import numpy as np
5
- import torch
6
-
7
- from utils import SymbolTable
8
-
9
-
10
- class TextTokenCollater:
11
- """Collate list of text tokens
12
-
13
- Map sentences to integers. Sentences are padded to equal length.
14
- Beginning and end-of-sequence symbols can be added.
15
-
16
- Example:
17
- >>> token_collater = TextTokenCollater(text_tokens)
18
- >>> tokens_batch, tokens_lens = token_collater(text)
19
-
20
- Returns:
21
- tokens_batch: IntTensor of shape (B, L)
22
- B: batch dimension, number of input sentences
23
- L: length of the longest sentence
24
- tokens_lens: IntTensor of shape (B,)
25
- Length of each sentence after adding <eos> and <bos>
26
- but before padding.
27
- """
28
-
29
- def __init__(
30
- self,
31
- text_tokens: List[str],
32
- add_eos: bool = True,
33
- add_bos: bool = True,
34
- pad_symbol: str = "<pad>",
35
- bos_symbol: str = "<bos>",
36
- eos_symbol: str = "<eos>",
37
- ):
38
- self.pad_symbol = pad_symbol
39
-
40
- self.add_eos = add_eos
41
- self.add_bos = add_bos
42
-
43
- self.bos_symbol = bos_symbol
44
- self.eos_symbol = eos_symbol
45
-
46
- unique_tokens = (
47
- [pad_symbol]
48
- + ([bos_symbol] if add_bos else [])
49
- + ([eos_symbol] if add_eos else [])
50
- + sorted(text_tokens)
51
- )
52
-
53
- self.token2idx = {token: idx for idx, token in enumerate(unique_tokens)}
54
- self.idx2token = [token for token in unique_tokens]
55
-
56
- def index(
57
- self, tokens_list: List[str]
58
- ) -> Tuple[torch.Tensor, torch.Tensor]:
59
- seqs, seq_lens = [], []
60
- for tokens in tokens_list:
61
- assert (
62
- all([True if s in self.token2idx else False for s in tokens])
63
- is True
64
- )
65
- seq = (
66
- ([self.bos_symbol] if self.add_bos else [])
67
- + list(tokens)
68
- + ([self.eos_symbol] if self.add_eos else [])
69
- )
70
- seqs.append(seq)
71
- seq_lens.append(len(seq))
72
-
73
- max_len = max(seq_lens)
74
- for k, (seq, seq_len) in enumerate(zip(seqs, seq_lens)):
75
- seq.extend([self.pad_symbol] * (max_len - seq_len))
76
-
77
- tokens = torch.from_numpy(
78
- np.array(
79
- [[self.token2idx[token] for token in seq] for seq in seqs],
80
- dtype=np.int64,
81
- )
82
- )
83
- tokens_lens = torch.IntTensor(seq_lens)
84
-
85
- return tokens, tokens_lens
86
-
87
- def __call__(self, texts: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
88
- tokens_seqs = [[p for p in text] for text in texts]
89
- max_len = len(max(tokens_seqs, key=len))
90
-
91
- seqs = [
92
- ([self.bos_symbol] if self.add_bos else [])
93
- + list(seq)
94
- + ([self.eos_symbol] if self.add_eos else [])
95
- + [self.pad_symbol] * (max_len - len(seq))
96
- for seq in tokens_seqs
97
- ]
98
-
99
- tokens_batch = torch.from_numpy(
100
- np.array(
101
- [seq for seq in seqs],
102
- dtype=np.int64,
103
- )
104
- )
105
-
106
- tokens_lens = torch.IntTensor(
107
- [
108
- len(seq) + int(self.add_eos) + int(self.add_bos)
109
- for seq in tokens_seqs
110
- ]
111
- )
112
-
113
- return tokens_batch, tokens_lens
114
-
115
-
116
- def get_text_token_collater() -> TextTokenCollater:
117
- collater = TextTokenCollater(
118
- ['0'], add_bos=False, add_eos=False
119
- )
120
- return collater
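
For orientation, here is a minimal usage sketch of the collater deleted above. It assumes the deleted `data/` and `utils/` packages are still importable; the three-symbol vocabulary is purely illustrative (the repository's own `get_text_token_collater` hard-codes `['0']`).

```python
from data.collation import TextTokenCollater

# Illustrative vocabulary; any list of token strings works.
collater = TextTokenCollater(["a", "b", "c"], add_bos=True, add_eos=True)

# index() maps character tokens to ids, adds <bos>/<eos>, and pads to the
# longest sequence in the batch.
tokens, tokens_lens = collater.index(["ab", "abc"])
print(tokens.shape)    # torch.Size([2, 5])  -> (B, L)
print(tokens_lens)     # tensor([4, 5], dtype=torch.int32)
```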
 
data/datamodule.py DELETED
@@ -1,419 +0,0 @@
1
- # Copyright 2023 (authors: Feiteng Li)
2
- #
3
- # See ../../../../LICENSE for clarification regarding multiple authors
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
-
18
- import argparse
19
- import inspect
20
- import logging
21
- from functools import lru_cache
22
- from pathlib import Path
23
- from typing import Any, Dict, Optional
24
-
25
- import torch
26
- # from icefall.utils import str2bool
27
- # from lhotse import CutSet, load_manifest_lazy
28
- # from lhotse.dataset import (
29
- # CutConcatenate,
30
- # DynamicBucketingSampler,
31
- # PrecomputedFeatures,
32
- # SingleCutSampler,
33
- # SpecAugment,
34
- # )
35
- # from lhotse.dataset.input_strategies import OnTheFlyFeatures
36
- # from lhotse.utils import fix_random_seed
37
- from torch.utils.data import DataLoader
38
-
39
- from data.collation import get_text_token_collater
40
- # from data.dataset import SpeechSynthesisDataset
41
- from data.fbank import get_fbank_extractor
42
- from data.input_strategies import PromptedPrecomputedFeatures
43
-
44
- # PrecomputedFeatures = PrecomputedFeatures
45
-
46
-
47
- class _SeedWorkers:
48
- def __init__(self, seed: int):
49
- self.seed = seed
50
-
51
- def __call__(self, worker_id: int):
52
- fix_random_seed(self.seed + worker_id)
53
-
54
-
55
- def _get_input_strategy(input_strategy, dataset, cuts):
56
- if input_strategy == "PromptedPrecomputedFeatures":
57
- return PromptedPrecomputedFeatures(dataset, cuts)
58
-
59
- return eval(input_strategy)()
60
-
61
-
62
- class TtsDataModule:
63
- """
64
- DataModule for VALL-E TTS experiments.
65
- It assumes there is always one train and valid dataloader.
66
-
67
- It contains all the common data pipeline modules used in TTS
68
- experiments, e.g.:
69
- - dynamic batch size,
70
- - bucketing samplers,
71
- - cut concatenation[not used & tested yet],
72
- - augmentation[not used & tested yet],
73
- - on-the-fly feature extraction[not used & tested yet]
74
-
75
- This class should be derived for specific corpora used in TTS tasks.
76
- """
77
-
78
- def __init__(self, args: argparse.Namespace):
79
- self.args = args
80
-
81
- @classmethod
82
- def add_arguments(cls, parser: argparse.ArgumentParser):
83
- group = parser.add_argument_group(
84
- title="TTS data related options",
85
- description="These options are used for the preparation of "
86
- "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
87
- "effective batch sizes, sampling strategies, applied data "
88
- "augmentations, etc.",
89
- )
90
- group.add_argument(
91
- "--manifest-dir",
92
- type=Path,
93
- default=Path("data/tokenized"),
94
- help="Path to directory with train/valid/test cuts.",
95
- )
96
- group.add_argument(
97
- "--max-duration",
98
- type=int,
99
- default=40.0,
100
- help="Maximum pooled recordings duration (seconds) in a "
101
- "single batch. You can reduce it if it causes CUDA OOM.",
102
- )
103
- group.add_argument(
104
- "--bucketing-sampler",
105
- type=str2bool,
106
- default=True,
107
- help="When enabled, the batches will come from buckets of "
108
- "similar duration (saves padding frames).",
109
- )
110
- group.add_argument(
111
- "--num-buckets",
112
- type=int,
113
- default=10,
114
- help="The number of buckets for the DynamicBucketingSampler"
115
- "(you might want to increase it for larger datasets).",
116
- )
117
- group.add_argument(
118
- "--concatenate-cuts",
119
- type=str2bool,
120
- default=False,
121
- help="When enabled, utterances (cuts) will be concatenated "
122
- "to minimize the amount of padding.",
123
- )
124
- group.add_argument(
125
- "--duration-factor",
126
- type=float,
127
- default=1.0,
128
- help="Determines the maximum duration of a concatenated cut "
129
- "relative to the duration of the longest cut in a batch.",
130
- )
131
- group.add_argument(
132
- "--gap",
133
- type=float,
134
- default=0.1,
135
- help="The amount of padding (in seconds) inserted between "
136
- "concatenated cuts. This padding is filled with noise when "
137
- "noise augmentation is used.",
138
- )
139
- group.add_argument(
140
- "--on-the-fly-feats",
141
- type=str2bool,
142
- default=False,
143
- help="When enabled, use on-the-fly cut mixing and feature "
144
- "extraction. Will drop existing precomputed feature manifests "
145
- "if available.",
146
- )
147
- group.add_argument(
148
- "--shuffle",
149
- type=str2bool,
150
- default=True,
151
- help="When enabled (=default), the examples will be "
152
- "shuffled for each epoch.",
153
- )
154
- group.add_argument(
155
- "--drop-last",
156
- type=str2bool,
157
- default=False,
158
- help="Whether to drop last batch. Used by sampler.",
159
- )
160
- group.add_argument(
161
- "--return-cuts",
162
- type=str2bool,
163
- default=True,
164
- help="When enabled, each batch will have the "
165
- "field: batch['supervisions']['cut'] with the cuts that "
166
- "were used to construct it.",
167
- )
168
-
169
- group.add_argument(
170
- "--num-workers",
171
- type=int,
172
- default=8,
173
- help="The number of training dataloader workers that "
174
- "collect the batches.",
175
- )
176
-
177
- group.add_argument(
178
- "--enable-spec-aug",
179
- type=str2bool,
180
- default=False,
181
- help="When enabled, use SpecAugment for training dataset.",
182
- )
183
-
184
- group.add_argument(
185
- "--spec-aug-time-warp-factor",
186
- type=int,
187
- default=80,
188
- help="Used only when --enable-spec-aug is True. "
189
- "It specifies the factor for time warping in SpecAugment. "
190
- "Larger values mean more warping. "
191
- "A value less than 1 means to disable time warp.",
192
- )
193
-
194
- group.add_argument(
195
- "--input-strategy",
196
- type=str,
197
- default="PrecomputedFeatures",
198
- help="AudioSamples or PrecomputedFeatures or PromptedPrecomputedFeatures",
199
- )
200
-
201
- group.add_argument(
202
- "--dataset",
203
- type=str,
204
- default="ljspeech",
205
- help="--input-strategy PromptedPrecomputedFeatures needs dataset name to prepare prompts.",
206
- )
207
-
208
- parser.add_argument(
209
- "--text-tokens",
210
- type=str,
211
- default="data/tokenized/unique_text_tokens.k2symbols",
212
- help="Path to the unique text tokens file",
213
- )
214
-
215
- parser.add_argument(
216
- "--sampling-rate",
217
- type=int,
218
- default=24000,
219
- help="""Audio sampling rate.""",
220
- )
221
-
222
- def train_dataloaders(
223
- self,
224
- cuts_train: CutSet,
225
- sampler_state_dict: Optional[Dict[str, Any]] = None,
226
- ) -> DataLoader:
227
- """
228
- Args:
229
- cuts_train:
230
- CutSet for training.
231
- sampler_state_dict:
232
- The state dict for the training sampler.
233
- """
234
- transforms = []
235
-
236
- if self.args.concatenate_cuts:
237
- logging.info(
238
- f"Using cut concatenation with duration factor "
239
- f"{self.args.duration_factor} and gap {self.args.gap}."
240
- )
241
- # Cut concatenation should be the first transform in the list,
242
- # so that if we e.g. mix noise in, it will fill the gaps between
243
- # different utterances.
244
- transforms = [
245
- CutConcatenate(
246
- duration_factor=self.args.duration_factor, gap=self.args.gap
247
- )
248
- ] + transforms
249
-
250
- input_transforms = []
251
- if self.args.enable_spec_aug:
252
- logging.info("Enable SpecAugment")
253
- logging.info(
254
- f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
255
- )
256
- # Set the value of num_frame_masks according to Lhotse's version.
257
- # In different Lhotse's versions, the default of num_frame_masks is
258
- # different.
259
- num_frame_masks = 10
260
- num_frame_masks_parameter = inspect.signature(
261
- SpecAugment.__init__
262
- ).parameters["num_frame_masks"]
263
- if num_frame_masks_parameter.default == 1:
264
- num_frame_masks = 2
265
- logging.info(f"Num frame mask: {num_frame_masks}")
266
- input_transforms.append(
267
- SpecAugment(
268
- time_warp_factor=self.args.spec_aug_time_warp_factor,
269
- num_frame_masks=num_frame_masks,
270
- features_mask_size=27,
271
- num_feature_masks=2,
272
- frames_mask_size=100,
273
- )
274
- )
275
- else:
276
- logging.info("Disable SpecAugment")
277
-
278
- logging.info("About to create train dataset")
279
- if self.args.on_the_fly_feats:
280
- # NOTE: the PerturbSpeed transform should be added only if we
281
- # remove it from data prep stage.
282
- # Add on-the-fly speed perturbation; since originally it would
283
- # have increased epoch size by 3, we will apply prob 2/3 and use
284
- # 3x more epochs.
285
- # Speed perturbation probably should come first before
286
- # concatenation, but in principle the transforms order doesn't have
287
- # to be strict (e.g. could be randomized)
288
- # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
289
- # Drop feats to be on the safe side.
290
- train = SpeechSynthesisDataset(
291
- get_text_token_collater(self.args.text_tokens),
292
- cut_transforms=transforms,
293
- feature_input_strategy=OnTheFlyFeatures(get_fbank_extractor()),
294
- feature_transforms=input_transforms,
295
- )
296
- else:
297
- train = SpeechSynthesisDataset(
298
- get_text_token_collater(self.args.text_tokens),
299
- feature_input_strategy=_get_input_strategy(
300
- self.args.input_strategy, self.args.dataset, cuts_train
301
- ),
302
- cut_transforms=transforms,
303
- feature_transforms=input_transforms,
304
- )
305
-
306
- if self.args.bucketing_sampler:
307
- logging.info("Using DynamicBucketingSampler")
308
- train_sampler = DynamicBucketingSampler(
309
- cuts_train,
310
- max_duration=self.args.max_duration,
311
- shuffle=self.args.shuffle,
312
- num_buckets=self.args.num_buckets,
313
- drop_last=self.args.drop_last,
314
- )
315
- else:
316
- logging.info(
317
- "Using SingleCutSampler and sort by duraton(ascending=True)."
318
- )
319
- cuts_train = cuts_train.to_eager().sort_by_duration(ascending=True)
320
- train_sampler = SingleCutSampler(
321
- cuts_train,
322
- max_duration=self.args.max_duration,
323
- shuffle=self.args.shuffle,
324
- )
325
- logging.info("About to create train dataloader")
326
-
327
- if sampler_state_dict is not None:
328
- logging.info("Loading sampler state dict")
329
- train_sampler.load_state_dict(sampler_state_dict)
330
-
331
- # 'seed' is derived from the current random state, which will have
332
- # previously been set in the main process.
333
- seed = torch.randint(0, 100000, ()).item()
334
- worker_init_fn = _SeedWorkers(seed)
335
-
336
- train_dl = DataLoader(
337
- train,
338
- sampler=train_sampler,
339
- batch_size=None,
340
- num_workers=self.args.num_workers,
341
- persistent_workers=False,
342
- worker_init_fn=worker_init_fn,
343
- )
344
-
345
- return train_dl
346
-
347
- def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
348
- logging.info("About to create dev dataset")
349
- if self.args.on_the_fly_feats:
350
- validate = SpeechSynthesisDataset(
351
- get_text_token_collater(self.args.text_tokens),
352
- feature_input_strategy=OnTheFlyFeatures(get_fbank_extractor()),
353
- cut_transforms=[],
354
- )
355
- else:
356
- validate = SpeechSynthesisDataset(
357
- get_text_token_collater(self.args.text_tokens),
358
- feature_input_strategy=_get_input_strategy(
359
- self.args.input_strategy, self.args.dataset, cuts_valid
360
- ),
361
- cut_transforms=[],
362
- )
363
- valid_sampler = DynamicBucketingSampler(
364
- cuts_valid,
365
- max_duration=self.args.max_duration,
366
- shuffle=False,
367
- )
368
- logging.info("About to create dev dataloader")
369
- valid_dl = DataLoader(
370
- validate,
371
- sampler=valid_sampler,
372
- batch_size=None,
373
- num_workers=4,
374
- persistent_workers=False,
375
- )
376
-
377
- return valid_dl
378
-
379
- def test_dataloaders(self, cuts: CutSet) -> DataLoader:
380
- logging.debug("About to create test dataset")
381
- test = SpeechSynthesisDataset(
382
- get_text_token_collater(self.args.text_tokens),
383
- feature_input_strategy=OnTheFlyFeatures(get_fbank_extractor())
384
- if self.args.on_the_fly_feats
385
- else _get_input_strategy(
386
- self.args.input_strategy, self.args.dataset, cuts
387
- ),
388
- cut_transforms=[],
389
- )
390
- sampler = DynamicBucketingSampler(
391
- cuts,
392
- max_duration=self.args.max_duration,
393
- shuffle=False,
394
- )
395
- logging.debug("About to create test dataloader")
396
- test_dl = DataLoader(
397
- test,
398
- batch_size=None,
399
- sampler=sampler,
400
- num_workers=self.args.num_workers,
401
- )
402
- return test_dl
403
-
404
- @lru_cache()
405
- def train_cuts(self) -> CutSet:
406
- logging.info("About to get train cuts")
407
- return load_manifest_lazy(
408
- self.args.manifest_dir / "cuts_train.jsonl.gz"
409
- )
410
-
411
- @lru_cache()
412
- def dev_cuts(self) -> CutSet:
413
- logging.info("About to get dev cuts")
414
- return load_manifest_lazy(self.args.manifest_dir / "cuts_dev.jsonl.gz")
415
-
416
- @lru_cache()
417
- def test_cuts(self) -> CutSet:
418
- logging.info("About to get test cuts")
419
- return load_manifest_lazy(self.args.manifest_dir / "cuts_test.jsonl.gz")
 
data/dataset.py DELETED
@@ -1,242 +0,0 @@
1
- # Copyright 2023 (authors: Feiteng Li)
2
- #
3
- # See ../../../../LICENSE for clarification regarding multiple authors
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- """
18
- modified from lhotse.dataset.speech_synthesis.py
19
- """
20
-
21
- import torch
22
- import math
23
- import h5py
24
- from tokenizers import Tokenizer
25
- from typing import Union, List
26
- import numpy as np
27
- from tqdm import tqdm
28
-
29
- _pad = '_'
30
- _punctuation = ',.!?-~…'
31
- _letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '
32
- symbols = [_pad] + list(_punctuation) + list(_letters)
33
-
34
- language_dict = {
35
- 'en': 0,
36
- 'zh': 1,
37
- 'ja': 2,
38
- }
39
- def seq2phone(tokens: Union[List, np.ndarray]):
40
- """
41
- Convert tokenized phoneme ID sequence back to phoneme string
42
- :param tokens: phoneme tokens
43
- :return: recovered phoneme sequence
44
- """
45
- phones = "".join([symbols[i] for i in tokens])
46
- return phones
47
-
48
- class DynamicBatchSampler(torch.utils.data.Sampler):
49
- def __init__(self, sampler, num_tokens_fn, num_buckets=100, min_size=0, max_size=1000,
50
- max_tokens=None, max_sentences=None, drop_last=False):
51
- """
52
-
53
- :param sampler:
54
- :param num_tokens_fn: function that returns the length of the sample at a given idx
55
- :param num_buckets: number of buckets; samples of similar length are bucketed so they end up in the same batch
56
- :param min_size: minimum sample length; shorter samples are filtered out (this value is also used to build the buckets)
57
- :param max_size: maximum sample length
58
- :param max_sentences: batch size; the final batch size is controlled jointly by max_sentences and max_tokens
59
- """
60
- super(DynamicBatchSampler, self).__init__(sampler)
61
- self.sampler = sampler
62
- self.num_tokens_fn = num_tokens_fn
63
- self.num_buckets = num_buckets
64
-
65
- self.min_size = min_size
66
- self.max_size = max_size
67
-
68
- assert max_size <= max_tokens, "max_size should be smaller than max tokens"
69
- assert max_tokens is not None or max_sentences is not None, \
70
- "max_tokens and max_sentences should not be null at the same time, please specify one parameter at least"
71
- self.max_tokens = max_tokens if max_tokens is not None else float('Inf')
72
- self.max_sentences = max_sentences if max_sentences is not None else float('Inf')
73
- self.drop_last = drop_last
74
-
75
- def set_epoch(self, epoch):
76
- self.sampler.set_epoch(epoch)
77
- def is_batch_full(self, num_tokens, batch):
78
- if len(batch) == 0:
79
- return False
80
- if len(batch) == self.max_sentences:
81
- return True
82
- if num_tokens > self.max_tokens:
83
- return True
84
- return False
85
-
86
- def __iter__(self):
87
- buckets = [[] for _ in range(self.num_buckets)]
88
- sample_len = [0] * self.num_buckets
89
-
90
- for idx in self.sampler:
91
- idx_length = self.num_tokens_fn(idx)
92
- if not (self.min_size <= idx_length <= self.max_size):
93
- print("sentence at index {} of size {} exceeds max_tokens, the sentence is ignored".format(idx, idx_length))
94
- continue
95
-
96
- index_buckets = math.floor((idx_length - self.min_size) / (self.max_size - self.min_size + 1)
97
- * self.num_buckets)
98
- sample_len[index_buckets] = max(sample_len[index_buckets], idx_length)
99
-
100
- num_tokens = (len(buckets[index_buckets]) + 1) * sample_len[index_buckets]
101
- if self.is_batch_full(num_tokens, buckets[index_buckets]):
102
- # yield this batch
103
- yield buckets[index_buckets]
104
- buckets[index_buckets] = []
105
- sample_len[index_buckets] = 0
106
-
107
- buckets[index_buckets].append(idx)
108
-
109
- # process left-over
110
- leftover_batch = []
111
- leftover_sample_len = 0
112
- leftover = [idx for bucket in buckets for idx in bucket]
113
- for idx in leftover:
114
- idx_length = self.num_tokens_fn(idx)
115
- leftover_sample_len = max(leftover_sample_len, idx_length)
116
- num_tokens = (len(leftover_batch) + 1) * leftover_sample_len
117
- if self.is_batch_full(num_tokens, leftover_batch):
118
- yield leftover_batch
119
- leftover_batch = []
120
- leftover_sample_len = 0
121
- leftover_batch.append(idx)
122
-
123
- if len(leftover_batch) > 0 and not self.drop_last:
124
- yield leftover_batch
125
-
126
- def __len__(self):
127
- # we do not know the exact batch size, so do not call len(dataloader)
128
- pass
129
-
130
-
131
- class AudioDataset(torch.utils.data.Dataset):
132
- def __init__(self, h5_path, ann_path, tokenizer_path):
133
- self.h5_path = h5_path
134
- with open(ann_path, 'r', encoding='utf-8') as f:
135
- lines = f.readlines()
136
- ls = [l.split("|") for l in lines]
137
- ls_T = list(zip(*ls))
138
- del ls_T[-1]
139
- self.h5_paths, self.durations, self.langs, self.texts = \
140
- list(ls_T[0]), list(ls_T[1]), list(ls_T[2]), list(ls_T[3])
141
- self.durations = [float(dur) for dur in self.durations]
142
- self.tokenizer = Tokenizer.from_file(tokenizer_path)
143
-
144
- self._archive = None
145
-
146
- def __len__(self):
147
- return len(self.h5_paths)
148
-
149
- def get_dur(self, idx):
150
- return self.durations[idx]
151
-
152
- @property
153
- def archive(self):
154
- if self._archive is None: # lazy loading here!
155
- self._archive = h5py.File(self.h5_path, "r")
156
- return self._archive
157
- def __getitem__(self, idx):
158
- archive = self.archive
159
- h5_path = self.h5_paths[idx]
160
- sub = archive[h5_path]
161
- audio_tokens = sub['audio'][()]
162
- phone_tokens = sub['text'][()]
163
- dur = self.durations[idx]
164
- lang = self.langs[idx]
165
- text = self.texts[idx]
166
- # tokenization should be done within dataloader
167
- phones = seq2phone(phone_tokens)
168
- phones = phones.replace(" ", "_")
169
- if not len(phones):
170
- cptpho_tokens = self.tokenizer.encode(text).ids
171
- else:
172
- cptpho_tokens = self.tokenizer.encode(phones).ids
173
- assert len(cptpho_tokens)
174
- return {
175
- 'utt_id': h5_path,
176
- 'text': text,
177
- 'audio': None,
178
- 'audio_lens': None,
179
- 'audio_features': audio_tokens,
180
- 'audio_features_lens': len(audio_tokens.T),
181
- 'text_tokens': np.array(cptpho_tokens),
182
- 'text_tokens_lens': len(cptpho_tokens),
183
- 'language': language_dict[lang],
184
- }
185
-
186
- def collate(batch):
187
- utt_id_s = [b['utt_id'] for b in batch]
188
- text_s = [b['text'] for b in batch]
189
-
190
- audio_s = [b['audio'] for b in batch]
191
- audio_lens_s = [b['audio_lens'] for b in batch]
192
-
193
- audio_features_lens_s = [b['audio_features_lens'] for b in batch]
194
- # create an empty tensor with maximum audio feature length
195
- audio_features_s = torch.zeros([len(batch), max(audio_features_lens_s), 8], dtype=torch.int64) - 1 # audio pad with -1
196
-
197
- text_tokens_lens_s = [b['text_tokens_lens'] for b in batch]
198
- # create an empty tensor with maximum text tokens length
199
- text_tokens_s = torch.zeros([len(batch), max(text_tokens_lens_s)], dtype=torch.int64) + 3 # [PAD] token id 3
200
-
201
- language_s = [b['language'] for b in batch]
202
-
203
- for i, b in enumerate(batch):
204
- audio_features = b['audio_features']
205
- audio_features_lens = b['audio_features_lens']
206
- audio_features_s[i, :audio_features_lens, :] = torch.LongTensor(audio_features.T)
207
-
208
- text_tokens = b['text_tokens']
209
- text_tokens_lens = b['text_tokens_lens']
210
- text_tokens_s[i, :text_tokens_lens] = torch.LongTensor(text_tokens)
211
-
212
- batch = {
213
- 'utt_id': utt_id_s,
214
- 'text': text_s,
215
- 'audio': audio_s,
216
- 'audio_lens': audio_lens_s,
217
- 'audio_features': audio_features_s,
218
- 'audio_features_lens': torch.LongTensor(np.array(audio_features_lens_s)),
219
- 'text_tokens': text_tokens_s,
220
- 'text_tokens_lens': torch.LongTensor(np.array(text_tokens_lens_s)),
221
- 'languages': torch.LongTensor(np.array(language_s)),
222
- }
223
- return batch
224
-
225
- def create_dataloader(data_dir="/root/valle/egs/mix", n_gpus=1, rank=0, num_workers=0, num_buckets=10, max_duration=120):
226
- train_dataset = AudioDataset(h5_path=f"{data_dir}/audio_sum.hdf5",
227
- ann_path=f"{data_dir}/audio_ann_sum.txt",
228
- tokenizer_path=f"{data_dir}/bpe_69.json")
229
- ran_sampler = torch.utils.data.distributed.DistributedSampler(
230
- train_dataset,
231
- num_replicas=n_gpus,
232
- rank=rank,
233
- shuffle=True,
234
- )
235
- dynamic_sampler = DynamicBatchSampler(ran_sampler, train_dataset.get_dur, num_buckets=num_buckets, max_size=20,
236
- max_tokens=max_duration)
237
-
238
-
239
- train_loader = torch.utils.data.DataLoader(train_dataset, num_workers=num_workers, collate_fn=collate,
240
- batch_sampler=dynamic_sampler)
241
-
242
- return train_loader
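
A hedged sketch of the HDF5-backed pipeline above, run on a single GPU; the data directory is a placeholder and must contain `audio_sum.hdf5`, `audio_ann_sum.txt`, and `bpe_69.json` in the layout `AudioDataset` expects.

```python
from data.dataset import create_dataloader

train_loader = create_dataloader(
    data_dir="/path/to/mix",   # placeholder; not a path from this repository
    n_gpus=1,
    rank=0,
    num_workers=0,
    max_duration=120,
)

for batch in train_loader:
    # collate() pads audio codes with -1 and text tokens with [PAD] id 3.
    print(batch["audio_features"].shape)   # (B, T_max, 8) EnCodec codebooks
    print(batch["text_tokens"].shape)      # (B, L_max)
    break
```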
 
data/fbank.py DELETED
@@ -1,212 +0,0 @@
1
- # Copyright 2023 (authors: Feiteng Li)
2
- #
3
- # See ../../../../LICENSE for clarification regarding multiple authors
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
-
18
- from dataclasses import asdict, dataclass
19
- from typing import Any, Dict, Optional, Union
20
-
21
- import numpy as np
22
- import torch
23
- # from lhotse.features.base import FeatureExtractor
24
- # from lhotse.utils import EPSILON, Seconds, compute_num_frames
25
- from librosa.filters import mel as librosa_mel_fn
26
-
27
-
28
- @dataclass
29
- class BigVGANFbankConfig:
30
- # Spectrogram-related part
31
- # Note that frame_length and frame_shift will be converted to milliseconds before torchaudio/Kaldi sees them
32
- frame_length: Seconds = 1024 / 24000.0
33
- frame_shift: Seconds = 256 / 24000.0
34
- remove_dc_offset: bool = True
35
- round_to_power_of_two: bool = True
36
-
37
- # Fbank-related part
38
- low_freq: float = 0.0
39
- high_freq: float = 12000.0
40
- num_mel_bins: int = 100
41
- use_energy: bool = False
42
-
43
- def to_dict(self) -> Dict[str, Any]:
44
- return asdict(self)
45
-
46
- @staticmethod
47
- def from_dict(data: Dict[str, Any]) -> "BigVGANFbankConfig":
48
- return BigVGANFbankConfig(**data)
49
-
50
-
51
- def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
52
- return torch.log(torch.clamp(x, min=clip_val) * C)
53
-
54
-
55
- def spectral_normalize_torch(magnitudes):
56
- output = dynamic_range_compression_torch(magnitudes)
57
- return output
58
-
59
-
60
- # https://github.com/NVIDIA/BigVGAN
61
- # bigvgan_24khz_100band https://drive.google.com/drive/folders/1EpxX6AsxjCbbk0mmAhE0td6eYiABr8Oz
62
- class BigVGANFbank(FeatureExtractor):
63
- name = "fbank"
64
- config_type = BigVGANFbankConfig
65
-
66
- def __init__(self, config: Optional[Any] = None):
67
- super(BigVGANFbank, self).__init__(config)
68
- sampling_rate = 24000
69
- self.mel_basis = torch.from_numpy(
70
- librosa_mel_fn(
71
- sampling_rate,
72
- 1024,
73
- self.config.num_mel_bins,
74
- self.config.low_freq,
75
- self.config.high_freq,
76
- ).astype(np.float32)
77
- )
78
- self.hann_window = torch.hann_window(1024)
79
-
80
- def _feature_fn(self, samples, **kwargs):
81
- win_length, n_fft = 1024, 1024
82
- hop_size = 256
83
- if True:
84
- sampling_rate = 24000
85
- duration = round(samples.shape[-1] / sampling_rate, ndigits=12)
86
- expected_num_frames = compute_num_frames(
87
- duration=duration,
88
- frame_shift=self.frame_shift,
89
- sampling_rate=sampling_rate,
90
- )
91
- pad_size = (
92
- (expected_num_frames - 1) * hop_size
93
- + win_length
94
- - samples.shape[-1]
95
- )
96
- assert pad_size >= 0
97
-
98
- y = torch.nn.functional.pad(
99
- samples,
100
- (0, pad_size),
101
- mode="constant",
102
- )
103
- else:
104
- y = torch.nn.functional.pad(
105
- samples,
106
- (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
107
- mode="reflect",
108
- )
109
-
110
- y = y.squeeze(1)
111
-
112
- # complex tensor as default, then use view_as_real for future pytorch compatibility
113
- spec = torch.stft(
114
- y,
115
- n_fft,
116
- hop_length=hop_size,
117
- win_length=win_length,
118
- window=self.hann_window,
119
- center=False,
120
- pad_mode="reflect",
121
- normalized=False,
122
- onesided=True,
123
- return_complex=True,
124
- )
125
- spec = torch.view_as_real(spec)
126
- spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
127
-
128
- spec = torch.matmul(self.mel_basis, spec)
129
- spec = spectral_normalize_torch(spec)
130
-
131
- return spec.transpose(2, 1).squeeze(0)
132
-
133
- def extract(
134
- self, samples: Union[np.ndarray, torch.Tensor], sampling_rate: int
135
- ) -> np.ndarray:
136
- assert sampling_rate == 24000
137
- params = asdict(self.config)
138
- params.update({"sample_frequency": sampling_rate, "snip_edges": False})
139
- params["frame_shift"] *= 1000.0
140
- params["frame_length"] *= 1000.0
141
- if not isinstance(samples, torch.Tensor):
142
- samples = torch.from_numpy(samples)
143
- # Torchaudio Kaldi feature extractors expect the channel dimension to be first.
144
- if len(samples.shape) == 1:
145
- samples = samples.unsqueeze(0)
146
- features = self._feature_fn(samples, **params).to(torch.float32)
147
- return features.numpy()
148
-
149
- @property
150
- def frame_shift(self) -> Seconds:
151
- return self.config.frame_shift
152
-
153
- def feature_dim(self, sampling_rate: int) -> int:
154
- return self.config.num_mel_bins
155
-
156
- @staticmethod
157
- def mix(
158
- features_a: np.ndarray,
159
- features_b: np.ndarray,
160
- energy_scaling_factor_b: float,
161
- ) -> np.ndarray:
162
- return np.log(
163
- np.maximum(
164
- # protection against log(0); max with EPSILON is adequate since these are energies (always >= 0)
165
- EPSILON,
166
- np.exp(features_a)
167
- + energy_scaling_factor_b * np.exp(features_b),
168
- )
169
- )
170
-
171
- @staticmethod
172
- def compute_energy(features: np.ndarray) -> float:
173
- return float(np.sum(np.exp(features)))
174
-
175
-
176
- def get_fbank_extractor() -> BigVGANFbank:
177
- return BigVGANFbank(BigVGANFbankConfig())
178
-
179
-
180
- if __name__ == "__main__":
181
- extractor = BigVGANFbank(BigVGANFbankConfig())
182
-
183
- samples = torch.from_numpy(np.random.random([1000]).astype(np.float32))
184
- samples = torch.clip(samples, -1.0, 1.0)
185
- fbank = extractor.extract(samples, 24000.0)
186
- print(f"fbank {fbank.shape}")
187
-
188
- from scipy.io.wavfile import read
189
-
190
- MAX_WAV_VALUE = 32768.0
191
-
192
- sampling_rate, samples = read(
193
- "egs/libritts/prompts/5639_40744_000000_000002.wav"
194
- )
195
- print(f"samples: [{samples.min()}, {samples.max()}]")
196
- fbank = extractor.extract(samples.astype(np.float32) / MAX_WAV_VALUE, 24000)
197
- print(f"fbank {fbank.shape}")
198
-
199
- import matplotlib.pyplot as plt
200
-
201
- _ = plt.figure(figsize=(18, 10))
202
- plt.imshow(
203
- X=fbank.transpose(1, 0),
204
- cmap=plt.get_cmap("jet"),
205
- aspect="auto",
206
- interpolation="nearest",
207
- )
208
- plt.gca().invert_yaxis()
209
- plt.savefig("egs/libritts/prompts/5639_40744_000000_000002.png")
210
- plt.close()
211
-
212
- print("fbank test PASS!")
 
data/input_strategies.py DELETED
@@ -1,159 +0,0 @@
1
- import random
2
- from collections import defaultdict
3
- from concurrent.futures import ThreadPoolExecutor
4
- from typing import Tuple, Type
5
-
6
- # from lhotse import CutSet
7
- # from lhotse.dataset.collation import collate_features
8
- # from lhotse.dataset.input_strategies import (
9
- # ExecutorType,
10
- # PrecomputedFeatures,
11
- # _get_executor,
12
- # )
13
- # from lhotse.utils import fastcopy
14
-
15
-
16
- class PromptedFeatures:
17
- def __init__(self, prompts, features):
18
- self.prompts = prompts
19
- self.features = features
20
-
21
- def to(self, device):
22
- return PromptedFeatures(
23
- self.prompts.to(device), self.features.to(device)
24
- )
25
-
26
- def sum(self):
27
- return self.features.sum()
28
-
29
- @property
30
- def ndim(self):
31
- return self.features.ndim
32
-
33
- @property
34
- def data(self):
35
- return (self.prompts, self.features)
36
-
37
-
38
- # class PromptedPrecomputedFeatures(PrecomputedFeatures):
39
- # """
40
- # :class:`InputStrategy` that reads pre-computed features, whose manifests
41
- # are attached to cuts, from disk.
42
- #
43
- # It automatically pads the feature matrices with pre or post feature.
44
- #
45
- # .. automethod:: __call__
46
- # """
47
- #
48
- # def __init__(
49
- # self,
50
- # dataset: str,
51
- # cuts: CutSet,
52
- # num_workers: int = 0,
53
- # executor_type: Type[ExecutorType] = ThreadPoolExecutor,
54
- # ) -> None:
55
- # super(PromptedPrecomputedFeatures, self).__init__(
56
- # num_workers, executor_type
57
- # )
58
- #
59
- # self.utt2neighbors = defaultdict(lambda: [])
60
- #
61
- # if dataset.lower() == "libritts":
62
- # # 909_131041_000013_000002
63
- # # 909_131041_000013_000003
64
- # speaker2utts = defaultdict(lambda: [])
65
- #
66
- # utt2cut = {}
67
- # for cut in cuts:
68
- # speaker = cut.supervisions[0].speaker
69
- # speaker2utts[speaker].append(cut.id)
70
- # utt2cut[cut.id] = cut
71
- #
72
- # for spk in speaker2utts:
73
- # uttids = sorted(speaker2utts[spk])
74
- # # Using the property of sorted keys to find previous utterance
75
- # # The keys have the structure speaker_book_x_y, e.g. 1089_134691_000004_000001
76
- # if len(uttids) == 1:
77
- # self.utt2neighbors[uttids[0]].append(utt2cut[uttids[0]])
78
- # continue
79
- #
80
- # utt2prevutt = dict(zip(uttids, [uttids[1]] + uttids[:-1]))
81
- # utt2postutt = dict(zip(uttids[:-1], uttids[1:]))
82
- #
83
- # for utt in utt2prevutt:
84
- # self.utt2neighbors[utt].append(utt2cut[utt2prevutt[utt]])
85
- #
86
- # for utt in utt2postutt:
87
- # self.utt2neighbors[utt].append(utt2cut[utt2postutt[utt]])
88
- # elif dataset.lower() == "ljspeech":
89
- # utt2cut = {}
90
- # uttids = []
91
- # for cut in cuts:
92
- # uttids.append(cut.id)
93
- # utt2cut[cut.id] = cut
94
- #
95
- # if len(uttids) == 1:
96
- # self.utt2neighbors[uttids[0]].append(utt2cut[uttids[0]])
97
- # else:
98
- # # Using the property of sorted keys to find previous utterance
99
- # # The keys have the structure: LJ001-0010
100
- # utt2prevutt = dict(zip(uttids, [uttids[1]] + uttids[:-1]))
101
- # utt2postutt = dict(zip(uttids[:-1], uttids[1:]))
102
- #
103
- # for utt in utt2postutt:
104
- # postutt = utt2postutt[utt]
105
- # if utt[:5] == postutt[:5]:
106
- # self.utt2neighbors[utt].append(utt2cut[postutt])
107
- #
108
- # for utt in utt2prevutt:
109
- # prevutt = utt2prevutt[utt]
110
- # if utt[:5] == prevutt[:5] or not self.utt2neighbors[utt]:
111
- # self.utt2neighbors[utt].append(utt2cut[prevutt])
112
- # else:
113
- # raise ValueError
114
- #
115
- # def __call__(
116
- # self, cuts: CutSet
117
- # ) -> Tuple[PromptedFeatures, PromptedFeatures]:
118
- # """
119
- # Reads the pre-computed features from disk/other storage.
120
- # The returned shape is``(B, T, F) => (batch_size, num_frames, num_features)``.
121
- #
122
- # :return: a tensor with collated features, and a tensor of ``num_frames`` of each cut before padding.
123
- # """
124
- # features, features_lens = collate_features(
125
- # cuts,
126
- # executor=_get_executor(
127
- # self.num_workers, executor_type=self._executor_type
128
- # ),
129
- # )
130
- #
131
- # prompts_cuts = []
132
- # for k, cut in enumerate(cuts):
133
- # prompts_cut = random.choice(self.utt2neighbors[cut.id])
134
- # prompts_cuts.append(fastcopy(prompts_cut, id=f"{cut.id}-{str(k)}"))
135
- #
136
- # mini_duration = min([cut.duration for cut in prompts_cuts] + [3.0])
137
- # # prompts_cuts = CutSet.from_cuts(prompts_cuts).truncate(
138
- # # max_duration=mini_duration,
139
- # # offset_type="random",
140
- # # preserve_id=True,
141
- # # )
142
- # prompts_cuts = CutSet(
143
- # cuts={k: cut for k, cut in enumerate(prompts_cuts)}
144
- # ).truncate(
145
- # max_duration=mini_duration,
146
- # offset_type="random",
147
- # preserve_id=False,
148
- # )
149
- #
150
- # prompts, prompts_lens = collate_features(
151
- # prompts_cuts,
152
- # executor=_get_executor(
153
- # self.num_workers, executor_type=self._executor_type
154
- # ),
155
- # )
156
- #
157
- # return PromptedFeatures(prompts, features), PromptedFeatures(
158
- # prompts_lens, features_lens
159
- # )
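
Only `PromptedFeatures` survives un-commented in this file; it is a thin container pairing prompt features with target features, e.g.:

```python
import torch

from data.input_strategies import PromptedFeatures

prompts = torch.randn(2, 75, 100)     # (B, T_prompt, F) prompt features
features = torch.randn(2, 300, 100)   # (B, T_target, F) target features

pf = PromptedFeatures(prompts, features).to("cpu")
print(pf.ndim)             # 3, taken from the target features
prompts, features = pf.data
```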
 
data/tokenizer.py DELETED
@@ -1,126 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright 2023 (authors: Feiteng Li)
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import re
17
- from dataclasses import asdict, dataclass
18
- from typing import Any, Dict, List, Optional, Pattern, Union
19
-
20
- import numpy as np
21
- import torch
22
- import torchaudio
23
- from encodec import EncodecModel
24
- from encodec.utils import convert_audio
25
-
26
- try:
27
- from pypinyin import Style, pinyin
28
- from pypinyin.style._utils import get_finals, get_initials
29
- except Exception:
30
- pass
31
-
32
-
33
- def remove_encodec_weight_norm(model):
34
- from encodec.modules import SConv1d
35
- from encodec.modules.seanet import SConvTranspose1d, SEANetResnetBlock
36
- from torch.nn.utils import remove_weight_norm
37
-
38
- encoder = model.encoder.model
39
- for key in encoder._modules:
40
- if isinstance(encoder._modules[key], SEANetResnetBlock):
41
- remove_weight_norm(encoder._modules[key].shortcut.conv.conv)
42
- block_modules = encoder._modules[key].block._modules
43
- for skey in block_modules:
44
- if isinstance(block_modules[skey], SConv1d):
45
- remove_weight_norm(block_modules[skey].conv.conv)
46
- elif isinstance(encoder._modules[key], SConv1d):
47
- remove_weight_norm(encoder._modules[key].conv.conv)
48
-
49
- decoder = model.decoder.model
50
- for key in decoder._modules:
51
- if isinstance(decoder._modules[key], SEANetResnetBlock):
52
- remove_weight_norm(decoder._modules[key].shortcut.conv.conv)
53
- block_modules = decoder._modules[key].block._modules
54
- for skey in block_modules:
55
- if isinstance(block_modules[skey], SConv1d):
56
- remove_weight_norm(block_modules[skey].conv.conv)
57
- elif isinstance(decoder._modules[key], SConvTranspose1d):
58
- remove_weight_norm(decoder._modules[key].convtr.convtr)
59
- elif isinstance(decoder._modules[key], SConv1d):
60
- remove_weight_norm(decoder._modules[key].conv.conv)
61
-
62
-
63
- class AudioTokenizer:
64
- """EnCodec audio."""
65
-
66
- def __init__(
67
- self,
68
- device: Any = None,
69
- ) -> None:
70
- # Instantiate a pretrained EnCodec model
71
- model = EncodecModel.encodec_model_24khz()
72
- model.set_target_bandwidth(6.0)
73
- remove_encodec_weight_norm(model)
74
-
75
- if not device:
76
- device = torch.device("cpu")
77
- if torch.cuda.is_available():
78
- device = torch.device("cuda:0")
79
- if torch.backends.mps.is_available():
80
- device = torch.device("mps")
81
-
82
- self._device = device
83
-
84
- self.codec = model.to(device)
85
- self.sample_rate = model.sample_rate
86
- self.channels = model.channels
87
-
88
- @property
89
- def device(self):
90
- return self._device
91
-
92
- def encode(self, wav: torch.Tensor) -> torch.Tensor:
93
- return self.codec.encode(wav.to(self.device))
94
-
95
- def decode(self, frames: torch.Tensor) -> torch.Tensor:
96
- return self.codec.decode(frames)
97
-
98
-
99
- def tokenize_audio(tokenizer: AudioTokenizer, audio):
100
- # Load and pre-process the audio waveform
101
- if isinstance(audio, str):
102
- wav, sr = torchaudio.load(audio)
103
- else:
104
- wav, sr = audio
105
- wav = convert_audio(wav, sr, tokenizer.sample_rate, tokenizer.channels)
106
- wav = wav.unsqueeze(0)
107
-
108
- # Extract discrete codes from EnCodec
109
- with torch.no_grad():
110
- encoded_frames = tokenizer.encode(wav)
111
- return encoded_frames
112
-
113
-
114
- if __name__ == "__main__":
115
- model = EncodecModel.encodec_model_24khz()
116
- model.set_target_bandwidth(6.0)
117
-
118
- samples = torch.from_numpy(np.random.random([4, 1, 1600])).type(
119
- torch.float32
120
- )
121
- codes_raw = model.encode(samples)
122
-
123
- remove_encodec_weight_norm(model)
124
- codes_norm = model.encode(samples)
125
-
126
- assert torch.allclose(codes_raw[0][0], codes_norm[0][0])
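
A hedged sketch of tokenizing a prompt recording with the classes above (`prompt.wav` is a placeholder path; the pretrained EnCodec checkpoint is downloaded on first use):

```python
from data.tokenizer import AudioTokenizer, tokenize_audio

tokenizer = AudioTokenizer()                              # picks cuda / mps / cpu
encoded_frames = tokenize_audio(tokenizer, "prompt.wav")  # placeholder input file

# EnCodec returns a list of (codes, scale) pairs; at 6 kbps the 24 kHz model
# produces codes of shape (1, 8, T) -- 8 residual quantizer streams.
codes = encoded_frames[0][0]
print(codes.shape)
```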
 
macros.py DELETED
@@ -1,44 +0,0 @@
1
- NUM_LAYERS = 12
2
- NUM_HEAD = 16
3
- N_DIM = 1024
4
- PREFIX_MODE = 1
5
- NUM_QUANTIZERS = 8
6
- SAMPLE_RATE = 24000
7
-
8
- lang2token = {
9
- 'zh': "[ZH]",
10
- 'ja': "[JA]",
11
- "en": "[EN]",
12
- "AR": "[AR]",
13
- 'mix': "",
14
- }
15
-
16
- lang2code = {
17
- 'zh': 0,
18
- 'ja': 1,
19
- "en": 2,
20
- "ar": 3,
21
- }
22
-
23
- token2lang = {
24
- '[ZH]': "zh",
25
- '[JA]': "ja",
26
- "[EN]": "en",
27
- "[AR]": "ar",
28
- "": "mix"
29
- }
30
-
31
- code2lang = {
32
- 0: 'zh',
33
- 1: 'ja',
34
- 2: "en",
35
- 3: "ar",
36
- }
37
-
38
- langdropdown2token = {
39
- 'English': "[EN]",
40
- '中文': "[ZH]",
41
- '日本語': "[JA]",
42
- 'عربي':"[AR]",
43
- 'Mix': "",
44
- }
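
One way these dictionaries compose (illustrative only; the surrounding-tag convention below is not prescribed by this file):

```python
from macros import lang2code, lang2token, token2lang

lang = "en"
text = "Hello world."
tagged = lang2token[lang] + text + lang2token[lang]   # "[EN]Hello world.[EN]"
lang_id = lang2code[token2lang[lang2token[lang]]]     # 2
print(tagged, lang_id)
```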
 
models/__init__.py DELETED
@@ -1,136 +0,0 @@
1
- import argparse
2
-
3
- import torch.nn as nn
4
- # from icefall.utils import AttributeDict, str2bool
5
-
6
- from .macros import (
7
- NUM_AUDIO_TOKENS,
8
- NUM_MEL_BINS,
9
- NUM_SPEAKER_CLASSES,
10
- NUM_TEXT_TOKENS,
11
- SPEAKER_EMBEDDING_DIM,
12
- )
13
- from .transformer import Transformer
14
- from .vallex import VALLE, VALLF
15
- from .visualizer import visualize
16
-
17
-
18
- def add_model_arguments(parser: argparse.ArgumentParser):
19
- parser.add_argument(
20
- "--model-name",
21
- type=str,
22
- default="VALL-E",
23
- help="VALL-E, VALL-F, Transformer.",
24
- )
25
- parser.add_argument(
26
- "--decoder-dim",
27
- type=int,
28
- default=1024,
29
- help="Embedding dimension in the decoder model.",
30
- )
31
- parser.add_argument(
32
- "--nhead",
33
- type=int,
34
- default=16,
35
- help="Number of attention heads in the Decoder layers.",
36
- )
37
- parser.add_argument(
38
- "--num-decoder-layers",
39
- type=int,
40
- default=12,
41
- help="Number of Decoder layers.",
42
- )
43
- parser.add_argument(
44
- "--scale-factor",
45
- type=float,
46
- default=1.0,
47
- help="Model scale factor which will be assigned different meanings in different models.",
48
- )
49
- parser.add_argument(
50
- "--norm-first",
51
- type=bool,
52
- default=True,
53
- help="Pre or Post Normalization.",
54
- )
55
- parser.add_argument(
56
- "--add-prenet",
57
- type=bool,
58
- default=False,
59
- help="Whether add PreNet after Inputs.",
60
- )
61
-
62
- # VALL-E & F
63
- parser.add_argument(
64
- "--prefix-mode",
65
- type=int,
66
- default=1,
67
- help="The mode for how to prefix VALL-E NAR Decoder, "
68
- "0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance.",
69
- )
70
- parser.add_argument(
71
- "--share-embedding",
72
- type=bool,
73
- default=True,
74
- help="Share the parameters of the output projection layer with the parameters of the acoustic embedding.",
75
- )
76
- parser.add_argument(
77
- "--prepend-bos",
78
- type=bool,
79
- default=False,
80
- help="Whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs.",
81
- )
82
- parser.add_argument(
83
- "--num-quantizers",
84
- type=int,
85
- default=8,
86
- help="Number of Audio/Semantic quantization layers.",
87
- )
88
-
89
- # Transformer
90
- parser.add_argument(
91
- "--scaling-xformers",
92
- type=bool,
93
- default=False,
94
- help="Apply Reworked Conformer scaling on Transformers.",
95
- )
96
-
97
-
98
- def get_model(params) -> nn.Module:
99
- if params.model_name.lower() in ["vall-f", "vallf"]:
100
- model = VALLF(
101
- params.decoder_dim,
102
- params.nhead,
103
- params.num_decoder_layers,
104
- norm_first=params.norm_first,
105
- add_prenet=params.add_prenet,
106
- prefix_mode=params.prefix_mode,
107
- share_embedding=params.share_embedding,
108
- nar_scale_factor=params.scale_factor,
109
- prepend_bos=params.prepend_bos,
110
- num_quantizers=params.num_quantizers,
111
- )
112
- elif params.model_name.lower() in ["vall-e", "valle"]:
113
- model = VALLE(
114
- params.decoder_dim,
115
- params.nhead,
116
- params.num_decoder_layers,
117
- norm_first=params.norm_first,
118
- add_prenet=params.add_prenet,
119
- prefix_mode=params.prefix_mode,
120
- share_embedding=params.share_embedding,
121
- nar_scale_factor=params.scale_factor,
122
- prepend_bos=params.prepend_bos,
123
- num_quantizers=params.num_quantizers,
124
- )
125
- else:
126
- assert params.model_name in ["Transformer"]
127
- model = Transformer(
128
- params.decoder_dim,
129
- params.nhead,
130
- params.num_decoder_layers,
131
- norm_first=params.norm_first,
132
- add_prenet=params.add_prenet,
133
- scaling_xformers=params.scaling_xformers,
134
- )
135
-
136
- return model
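
A hedged sketch of building a model from these flags (assumes the `models.vallex` / `models.transformer` definitions deleted alongside this file, and the `modules` package they depend on, remain importable):

```python
import argparse

from models import add_model_arguments, get_model

parser = argparse.ArgumentParser()
add_model_arguments(parser)
params = parser.parse_args([])   # defaults: VALL-E, 12 layers, 16 heads, d_model=1024

model = get_model(params)
print(sum(p.numel() for p in model.parameters()) / 1e6, "M parameters")
```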
 
models/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (4.4 kB)
 
models/__pycache__/macros.cpython-311.pyc DELETED
Binary file (335 Bytes)
 
models/__pycache__/transformer.cpython-311.pyc DELETED
Binary file (15.1 kB)
 
models/__pycache__/vallex.cpython-311.pyc DELETED
Binary file (37.6 kB)
 
models/__pycache__/visualizer.cpython-311.pyc DELETED
Binary file (5.17 kB)
 
models/macros.py DELETED
@@ -1,11 +0,0 @@
1
- # Text
2
- NUM_TEXT_TOKENS = 2048
3
-
4
- # Audio
5
- NUM_AUDIO_TOKENS = 1024 # EnCodec RVQ bins
6
- NUM_MEL_BINS = 100 # BigVGAN bigvgan_24khz_100band
7
-
8
-
9
- # Speaker
10
- NUM_SPEAKER_CLASSES = 4096
11
- SPEAKER_EMBEDDING_DIM = 64
 
models/transformer.py DELETED
@@ -1,394 +0,0 @@
1
- # Copyright 2023 (authors: Feiteng Li)
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from functools import partial
16
- from typing import Any, Dict, List, Tuple, Union
17
-
18
- import torch
19
- import torch.nn as nn
20
- import torch.nn.functional as F
21
- # from icefall.utils import make_pad_mask
22
- # from torchmetrics.classification import BinaryAccuracy
23
-
24
- from models.vallex import Transpose
25
- from modules.embedding import SinePositionalEmbedding, TokenEmbedding
26
- from modules.scaling import BalancedDoubleSwish, ScaledLinear
27
- from modules.transformer import (
28
- BalancedBasicNorm,
29
- IdentityNorm,
30
- TransformerDecoderLayer,
31
- TransformerEncoder,
32
- TransformerEncoderLayer,
33
- )
34
-
35
- from .macros import NUM_MEL_BINS, NUM_TEXT_TOKENS
36
- from .visualizer import visualize
37
-
38
- IdentityNorm = IdentityNorm
39
-
40
-
41
- class Transformer(nn.Module):
42
- """It implements seq2seq Transformer TTS for debug(No StopPredictor and SpeakerEmbeding)
43
- Neural Speech Synthesis with Transformer Network
44
- https://arxiv.org/abs/1809.08895
45
- """
46
-
47
- def __init__(
48
- self,
49
- d_model: int,
50
- nhead: int,
51
- num_layers: int,
52
- norm_first: bool = True,
53
- add_prenet: bool = False,
54
- scaling_xformers: bool = False,
55
- ):
56
- """
57
- Args:
58
- d_model:
59
- The number of expected features in the input (required).
60
- nhead:
61
- The number of heads in the multiheadattention models (required).
62
- num_layers:
63
- The number of sub-decoder-layers in the decoder (required).
64
- """
65
- super().__init__()
66
- self.text_embedding = TokenEmbedding(d_model, NUM_TEXT_TOKENS) # W_x
67
-
68
- if add_prenet:
69
- self.encoder_prenet = nn.Sequential(
70
- Transpose(),
71
- nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
72
- nn.BatchNorm1d(d_model),
73
- nn.ReLU(),
74
- nn.Dropout(0.5),
75
- nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
76
- nn.BatchNorm1d(d_model),
77
- nn.ReLU(),
78
- nn.Dropout(0.5),
79
- nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
80
- nn.BatchNorm1d(d_model),
81
- nn.ReLU(),
82
- nn.Dropout(0.5),
83
- Transpose(),
84
- nn.Linear(d_model, d_model),
85
- )
86
-
87
- self.decoder_prenet = nn.Sequential(
88
- nn.Linear(NUM_MEL_BINS, 256),
89
- nn.ReLU(),
90
- nn.Dropout(0.5),
91
- nn.Linear(256, 256),
92
- nn.ReLU(),
93
- nn.Dropout(0.5),
94
- nn.Linear(256, d_model),
95
- )
96
-
97
- assert scaling_xformers is False # TODO: update this block
98
- else:
99
- self.encoder_prenet = nn.Identity()
100
- if scaling_xformers:
101
- self.decoder_prenet = ScaledLinear(NUM_MEL_BINS, d_model)
102
- else:
103
- self.decoder_prenet = nn.Linear(NUM_MEL_BINS, d_model)
104
-
105
- self.encoder_position = SinePositionalEmbedding(
106
- d_model,
107
- dropout=0.1,
108
- scale=False,
109
- )
110
- self.decoder_position = SinePositionalEmbedding(
111
- d_model, dropout=0.1, scale=False
112
- )
113
-
114
- if scaling_xformers:
115
- self.encoder = TransformerEncoder(
116
- TransformerEncoderLayer(
117
- d_model,
118
- nhead,
119
- dim_feedforward=d_model * 4,
120
- dropout=0.1,
121
- batch_first=True,
122
- norm_first=norm_first,
123
- linear1_self_attention_cls=ScaledLinear,
124
- linear2_self_attention_cls=partial(
125
- ScaledLinear, initial_scale=0.01
126
- ),
127
- linear1_feedforward_cls=ScaledLinear,
128
- linear2_feedforward_cls=partial(
129
- ScaledLinear, initial_scale=0.01
130
- ),
131
- activation=partial(
132
- BalancedDoubleSwish,
133
- channel_dim=-1,
134
- max_abs=10.0,
135
- min_prob=0.25,
136
- ),
137
- layer_norm_cls=IdentityNorm,
138
- ),
139
- num_layers=num_layers,
140
- norm=BalancedBasicNorm(d_model) if norm_first else None,
141
- )
142
-
143
- self.decoder = nn.TransformerDecoder(
144
- TransformerDecoderLayer(
145
- d_model,
146
- nhead,
147
- dim_feedforward=d_model * 4,
148
- dropout=0.1,
149
- batch_first=True,
150
- norm_first=norm_first,
151
- linear1_self_attention_cls=ScaledLinear,
152
- linear2_self_attention_cls=partial(
153
- ScaledLinear, initial_scale=0.01
154
- ),
155
- linear1_feedforward_cls=ScaledLinear,
156
- linear2_feedforward_cls=partial(
157
- ScaledLinear, initial_scale=0.01
158
- ),
159
- activation=partial(
160
- BalancedDoubleSwish,
161
- channel_dim=-1,
162
- max_abs=10.0,
163
- min_prob=0.25,
164
- ),
165
- layer_norm_cls=IdentityNorm,
166
- ),
167
- num_layers=num_layers,
168
- norm=BalancedBasicNorm(d_model) if norm_first else None,
169
- )
170
-
171
- self.predict_layer = ScaledLinear(d_model, NUM_MEL_BINS)
172
- self.stop_layer = nn.Linear(d_model, 1)
173
- else:
174
- self.encoder = nn.TransformerEncoder(
175
- nn.TransformerEncoderLayer(
176
- d_model,
177
- nhead,
178
- dim_feedforward=d_model * 4,
179
- activation=F.relu,
180
- dropout=0.1,
181
- batch_first=True,
182
- norm_first=norm_first,
183
- ),
184
- num_layers=num_layers,
185
- norm=nn.LayerNorm(d_model) if norm_first else None,
186
- )
187
-
188
- self.decoder = nn.TransformerDecoder(
189
- nn.TransformerDecoderLayer(
190
- d_model,
191
- nhead,
192
- dim_feedforward=d_model * 4,
193
- activation=F.relu,
194
- dropout=0.1,
195
- batch_first=True,
196
- norm_first=norm_first,
197
- ),
198
- num_layers=num_layers,
199
- norm=nn.LayerNorm(d_model) if norm_first else None,
200
- )
201
-
202
- self.predict_layer = nn.Linear(d_model, NUM_MEL_BINS)
203
- self.stop_layer = nn.Linear(d_model, 1)
204
-
205
- self.stop_accuracy_metric = BinaryAccuracy(
206
- threshold=0.5, multidim_average="global"
207
- )
208
-
209
- # self.apply(self._init_weights)
210
-
211
- # def _init_weights(self, module):
212
- # if isinstance(module, (nn.Linear)):
213
- # module.weight.data.normal_(mean=0.0, std=0.02)
214
- # if isinstance(module, nn.Linear) and module.bias is not None:
215
- # module.bias.data.zero_()
216
- # elif isinstance(module, nn.LayerNorm):
217
- # module.bias.data.zero_()
218
- # module.weight.data.fill_(1.0)
219
- # elif isinstance(module, nn.Embedding):
220
- # module.weight.data.normal_(mean=0.0, std=0.02)
221
-
222
- def forward(
223
- self,
224
- x: torch.Tensor,
225
- x_lens: torch.Tensor,
226
- y: torch.Tensor,
227
- y_lens: torch.Tensor,
228
- reduction: str = "sum",
229
- train_stage: int = 0,
230
- **kwargs,
231
- ) -> Tuple[torch.Tensor, Union[torch.Tensor, None]]:
232
- """
233
- Args:
234
- x:
235
- A 2-D tensor of shape (N, S).
236
- x_lens:
237
- A 1-D tensor of shape (N,). It contains the number of tokens in `x`
238
- before padding.
239
- y:
240
- A 3-D tensor of shape (N, T, NUM_MEL_BINS).
241
- y_lens:
242
- A 1-D tensor of shape (N,). It contains the number of frames in `y`
243
- before padding.
244
- train_stage:
245
- Not used in this model.
246
- Returns:
247
- Return the encoder output and predicted features, the total MSE plus weighted stop loss, and a metrics dict.
248
- """
249
- del train_stage
250
-
251
- assert x.ndim == 2, x.shape
252
- assert x_lens.ndim == 1, x_lens.shape
253
- assert y.ndim == 3, y.shape
254
- assert y_lens.ndim == 1, y_lens.shape
255
-
256
- assert torch.all(x_lens > 0)
257
-
258
- # NOTE: x has been padded in TextTokenCollater
259
- x_mask = make_pad_mask(x_lens).to(x.device)
260
-
261
- x = self.text_embedding(x)
262
- x = self.encoder_prenet(x)
263
- x = self.encoder_position(x)
264
- x = self.encoder(x, src_key_padding_mask=x_mask)
265
-
266
- total_loss, metrics = 0.0, {}
267
-
268
- y_mask = make_pad_mask(y_lens).to(y.device)
269
- y_mask_float = y_mask.type(torch.float32)
270
- data_mask = 1.0 - y_mask_float.unsqueeze(-1)
271
-
272
- # Training
273
- # AR Decoder
274
- def pad_y(y):
275
- y = F.pad(y, (0, 0, 1, 0, 0, 0), value=0).detach()
276
- # inputs, targets
277
- return y[:, :-1], y[:, 1:]
278
-
279
- y, targets = pad_y(y * data_mask) # mask padding as zeros
280
-
281
- y_emb = self.decoder_prenet(y)
282
- y_pos = self.decoder_position(y_emb)
283
-
284
- y_len = y_lens.max()
285
- tgt_mask = torch.triu(
286
- torch.ones(y_len, y_len, device=y.device, dtype=torch.bool),
287
- diagonal=1,
288
- )
289
- y_dec = self.decoder(
290
- y_pos,
291
- x,
292
- tgt_mask=tgt_mask,
293
- memory_key_padding_mask=x_mask,
294
- )
295
-
296
- predict = self.predict_layer(y_dec)
297
- # loss
298
- total_loss = F.mse_loss(predict, targets, reduction=reduction)
299
-
300
- logits = self.stop_layer(y_dec).squeeze(-1)
301
- stop_loss = F.binary_cross_entropy_with_logits(
302
- logits,
303
- y_mask_float.detach(),
304
- weight=1.0 + y_mask_float.detach() * 4.0,
305
- reduction=reduction,
306
- )
307
- metrics["stop_loss"] = stop_loss.detach()
308
-
309
- stop_accuracy = self.stop_accuracy_metric(
310
- (torch.sigmoid(logits) >= 0.5).type(torch.int64),
311
- y_mask.type(torch.int64),
312
- )
313
- # icefall MetricsTracker.norm_items()
314
- metrics["stop_accuracy"] = stop_accuracy.item() * y_lens.sum().type(
315
- torch.float32
316
- )
317
-
318
- return ((x, predict), total_loss + 100.0 * stop_loss, metrics)
319
-
320
- def inference(
321
- self,
322
- x: torch.Tensor,
323
- x_lens: torch.Tensor,
324
- y: Any = None,
325
- **kwargs,
326
- ) -> torch.Tensor:
327
- """
328
- Args:
329
- x:
330
- A 2-D tensor of shape (1, S).
331
- x_lens:
332
- A 1-D tensor of shape (1,). It contains the number of tokens in `x`
333
- before padding.
334
- Returns:
335
- Return the predicted feature frames (the initial zero frame is stripped).
336
- """
337
- assert x.ndim == 2, x.shape
338
- assert x_lens.ndim == 1, x_lens.shape
339
-
340
- assert torch.all(x_lens > 0)
341
-
342
- x_mask = make_pad_mask(x_lens).to(x.device)
343
-
344
- x = self.text_embedding(x)
345
- x = self.encoder_prenet(x)
346
- x = self.encoder_position(x)
347
- x = self.encoder(x, src_key_padding_mask=x_mask)
348
-
349
- x_mask = make_pad_mask(x_lens).to(x.device)
350
-
351
- # AR Decoder
352
- # TODO: manage decoder steps to avoid repetitive computation
353
- y = torch.zeros(
354
- [x.shape[0], 1, NUM_MEL_BINS], dtype=torch.float32, device=x.device
355
- )
356
- while True:
357
- y_emb = self.decoder_prenet(y)
358
- y_pos = self.decoder_position(y_emb)
359
-
360
- tgt_mask = torch.triu(
361
- torch.ones(
362
- y.shape[1], y.shape[1], device=y.device, dtype=torch.bool
363
- ),
364
- diagonal=1,
365
- )
366
-
367
- y_dec = self.decoder(
368
- y_pos,
369
- x,
370
- tgt_mask=tgt_mask,
371
- memory_mask=None,
372
- memory_key_padding_mask=x_mask,
373
- )
374
- predict = self.predict_layer(y_dec[:, -1:])
375
-
376
- logits = self.stop_layer(y_dec[:, -1:]) > 0 # sigmoid(0.0) = 0.5
377
- if y.shape[1] > x_lens.max() * 10 or all(logits.cpu().numpy()):
378
- print(
379
- f"TransformerTTS EOS [Text {x_lens[0]} -> Audio {y.shape[1]}]"
380
- )
381
- break
382
-
383
- y = torch.concat([y, predict], dim=1)
384
-
385
- return y[:, 1:]
386
-
387
- def visualize(
388
- self,
389
- predicts: Tuple[torch.Tensor],
390
- batch: Dict[str, Union[List, torch.Tensor]],
391
- output_dir: str,
392
- limit: int = 4,
393
- ) -> None:
394
- visualize(predicts, batch, output_dir, limit=limit)
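
A minimal, self-contained sketch (not part of the deleted file) of the stop-token rule used in Transformer.inference() above: one frame is appended per step, and decoding halts once the stop logit goes positive (equivalent to sigmoid > 0.5) or a length cap is hit. The small Linear modules and the NUM_MEL_BINS value are hypothetical stand-ins for the real decoder stack and macros.

import torch
import torch.nn as nn

torch.manual_seed(0)
NUM_MEL_BINS = 100                                  # assumed value; the real one lives in models/macros.py
d_model = 32
decoder_prenet = nn.Linear(NUM_MEL_BINS, d_model)   # stand-in prenet
fake_decoder = nn.Linear(d_model, d_model)          # stand-in for the Transformer decoder
predict_layer = nn.Linear(d_model, NUM_MEL_BINS)
stop_layer = nn.Linear(d_model, 1)

y = torch.zeros(1, 1, NUM_MEL_BINS)                 # all-zero start frame, as in inference()
max_len = 50
while y.shape[1] < max_len:
    h = fake_decoder(decoder_prenet(y))
    next_frame = predict_layer(h[:, -1:])
    if (stop_layer(h[:, -1:]) > 0).all():           # logit > 0 <=> sigmoid(logit) > 0.5
        break
    y = torch.cat([y, next_frame], dim=1)
print(y[:, 1:].shape)                               # generated frames, start frame stripped as in the original
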
models/vallex.py DELETED
@@ -1,853 +0,0 @@
1
- # Copyright 2023 (authors: Feiteng Li)
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import random
16
- from typing import Dict, Iterator, List, Tuple, Union
17
-
18
- import numpy as np
19
- import torch
20
- import torch.nn as nn
21
- import torch.nn.functional as F
22
- # from icefall.utils import make_pad_mask
23
- # from torchmetrics.classification import MulticlassAccuracy
24
-
25
- from data.input_strategies import PromptedFeatures
26
- from modules.embedding import SinePositionalEmbedding, TokenEmbedding
27
- from modules.transformer import (
28
- AdaptiveLayerNorm,
29
- LayerNorm,
30
- TransformerDecoderLayer,
31
- TransformerEncoder,
32
- TransformerEncoderLayer,
33
- )
34
-
35
- from .macros import NUM_AUDIO_TOKENS, NUM_TEXT_TOKENS
36
- from .visualizer import visualize
37
-
38
-
39
- class Transpose(nn.Identity):
40
- """(N, T, D) -> (N, D, T)"""
41
-
42
- def forward(self, input: torch.Tensor) -> torch.Tensor:
43
- return input.transpose(1, 2)
44
-
45
-
46
- # NOTE: There are two ways to implement the model
47
- # 1) [VALL-F] standard TransformerDecoder, use x as memory
48
- # 2) [VALL-E] modified TransformerDecoder like GPT-x(e.g. causal TransformerEncoder),
49
- # use x as the prefix of decoder inputs
50
- class VALLF(nn.Module):
51
- """It implements https://arxiv.org/abs/2301.02111
52
- "Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers"
53
- """
54
-
55
- def __init__(
56
- self,
57
- d_model: int,
58
- nhead: int,
59
- num_layers: int,
60
- norm_first: bool = True,
61
- add_prenet: bool = False,
62
- decoder_cls: Union[
63
- nn.TransformerDecoder, nn.TransformerEncoder
64
- ] = nn.TransformerDecoder,
65
- decoder_layer_cls: Union[
66
- TransformerDecoderLayer, TransformerEncoderLayer
67
- ] = TransformerDecoderLayer,
68
- prefix_mode: int = 0,
69
- share_embedding: bool = True,
70
- nar_scale_factor: float = 1.0,
71
- prepend_bos: bool = True,
72
- num_quantizers: int = 8,
73
- ):
74
- """
75
- Args:
76
- d_model:
77
- The number of expected features in the input (required).
78
- nhead:
79
- The number of heads in the multiheadattention models (required).
80
- num_layers:
81
- The number of sub-decoder-layers in the decoder (required).
82
- """
83
- super().__init__()
84
- nar_d_model = int(d_model * nar_scale_factor)
85
-
86
- self.ar_text_embedding = TokenEmbedding(d_model, NUM_TEXT_TOKENS) # W_x
87
- self.nar_text_embedding = TokenEmbedding(nar_d_model, NUM_TEXT_TOKENS)
88
-
89
- # ID NUM_AUDIO_TOKENS -> PAD
90
- # ID NUM_AUDIO_TOKENS + 1 -> BOS
91
- self.ar_audio_prepend_bos = prepend_bos
92
- self.ar_audio_embedding = TokenEmbedding(
93
- d_model, NUM_AUDIO_TOKENS + 1 + int(prepend_bos)
94
- )
95
-
96
- # PreNet
97
- if add_prenet:
98
- self.ar_text_prenet = nn.Sequential(
99
- Transpose(),
100
- nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
101
- nn.BatchNorm1d(d_model),
102
- nn.ReLU(),
103
- nn.Dropout(0.5),
104
- nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
105
- nn.BatchNorm1d(d_model),
106
- nn.ReLU(),
107
- nn.Dropout(0.5),
108
- nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
109
- nn.BatchNorm1d(d_model),
110
- nn.ReLU(),
111
- nn.Dropout(0.5),
112
- Transpose(),
113
- nn.Linear(d_model, d_model),
114
- )
115
-
116
- self.ar_audio_prenet = nn.Sequential(
117
- nn.Linear(d_model, 256),
118
- nn.ReLU(),
119
- nn.Dropout(0.25),
120
- nn.Linear(256, 256),
121
- nn.ReLU(),
122
- nn.Dropout(0.25),
123
- nn.Linear(256, d_model),
124
- )
125
- else:
126
- self.ar_text_prenet = nn.Identity()
127
- self.ar_audio_prenet = nn.Identity()
128
-
129
- self.ar_text_position = SinePositionalEmbedding(
130
- d_model,
131
- dropout=0.1,
132
- scale=False,
133
- alpha=True,
134
- )
135
- self.ar_audio_position = SinePositionalEmbedding(
136
- d_model,
137
- dropout=0.1,
138
- scale=False,
139
- alpha=True,
140
- )
141
-
142
- self.ar_decoder = decoder_cls(
143
- decoder_layer_cls(
144
- d_model,
145
- nhead,
146
- dim_feedforward=d_model * 4,
147
- dropout=0.1,
148
- batch_first=True,
149
- norm_first=norm_first,
150
- ),
151
- num_layers=num_layers,
152
- norm=LayerNorm(d_model) if norm_first else None,
153
- )
154
- self.ar_predict_layer = nn.Linear(
155
- d_model, NUM_AUDIO_TOKENS + 1, bias=False
156
- )
157
-
158
- self.rng = random.Random(0)
159
- self.num_heads = nhead
160
- self.prefix_mode = prefix_mode
161
- self.num_quantizers = num_quantizers
162
-
163
- assert num_quantizers >= 1
164
- if num_quantizers > 1:
165
- self.nar_audio_embeddings = nn.ModuleList(
166
- [TokenEmbedding(nar_d_model, NUM_AUDIO_TOKENS + 1)]
167
- + [
168
- TokenEmbedding(nar_d_model, NUM_AUDIO_TOKENS)
169
- for i in range(num_quantizers - 1)
170
- ]
171
- ) # W_a
172
-
173
- # PreNet
174
- if add_prenet:
175
- self.nar_text_prenet = nn.Sequential(
176
- Transpose(),
177
- nn.Conv1d(
178
- nar_d_model, nar_d_model, kernel_size=5, padding="same"
179
- ),
180
- nn.BatchNorm1d(nar_d_model),
181
- nn.ReLU(),
182
- nn.Dropout(0.5),
183
- nn.Conv1d(
184
- nar_d_model, nar_d_model, kernel_size=5, padding="same"
185
- ),
186
- nn.BatchNorm1d(nar_d_model),
187
- nn.ReLU(),
188
- nn.Dropout(0.5),
189
- nn.Conv1d(
190
- nar_d_model, nar_d_model, kernel_size=5, padding="same"
191
- ),
192
- nn.BatchNorm1d(nar_d_model),
193
- nn.ReLU(),
194
- nn.Dropout(0.5),
195
- Transpose(),
196
- nn.Linear(nar_d_model, nar_d_model),
197
- )
198
- self.nar_audio_prenet = nn.Sequential(
199
- nn.Linear(nar_d_model, 256),
200
- nn.ReLU(),
201
- nn.Dropout(0.25),
202
- nn.Linear(256, 256),
203
- nn.ReLU(),
204
- nn.Dropout(0.25),
205
- nn.Linear(256, nar_d_model),
206
- )
207
- else:
208
- self.nar_text_prenet = nn.Identity()
209
- self.nar_audio_prenet = nn.Identity()
210
-
211
- self.nar_text_position = SinePositionalEmbedding(
212
- nar_d_model,
213
- dropout=0.0,
214
- scale=False,
215
- alpha=False,
216
- )
217
- self.nar_audio_position = SinePositionalEmbedding(
218
- nar_d_model,
219
- dropout=0.1,
220
- scale=False,
221
- alpha=False,
222
- )
223
-
224
- self.nar_decoder = decoder_cls(
225
- decoder_layer_cls(
226
- nar_d_model,
227
- int(nhead * nar_scale_factor),
228
- dim_feedforward=nar_d_model * 4,
229
- dropout=0.1,
230
- batch_first=True,
231
- norm_first=norm_first,
232
- adaptive_layer_norm=True,
233
- ),
234
- num_layers=int(num_layers * nar_scale_factor),
235
- norm=AdaptiveLayerNorm(
236
- nar_d_model, norm=nn.LayerNorm(nar_d_model)
237
- )
238
- if norm_first
239
- else None,
240
- )
241
- self.nar_predict_layers = nn.ModuleList(
242
- [
243
- nn.Linear(nar_d_model, NUM_AUDIO_TOKENS, bias=False)
244
- for i in range(num_quantizers - 1)
245
- ]
246
- )
247
- self.nar_stage_embeddings = nn.ModuleList(
248
- [
249
- TokenEmbedding(nar_d_model, 1)
250
- for i in range(num_quantizers - 1)
251
- ]
252
- )
253
-
254
- if share_embedding:
255
- # We share the parameters of the output projection layer with the parameters of the acoustic embedding Wa
256
- # NOTE(Feiteng): In the experiment, this undermines accuracy
257
- # self.ar_predict_layer.weight = self.ar_audio_embedding.weight
258
-
259
- # We also share the parameters of the acoustic embedding layer and the output prediction layer,
260
- # which means the weights of the j-th prediction layer are the same as the (j + 1)-th acoustic embedding layer.
261
- for j in range(0, num_quantizers - 2):
262
- self.nar_predict_layers[
263
- j
264
- ].weight = self.nar_audio_embeddings[j + 2].weight
265
-
266
- def stage_parameters(self, stage: int = 1) -> Iterator[nn.Parameter]:
267
- assert stage > 0
268
- if stage == 1:
269
- for name, param in self.named_parameters():
270
- if name.startswith("ar_"):
271
- print(f" AR parameter: {name}")
272
- yield param
273
-
274
- if stage == 2:
275
- for name, param in self.named_parameters():
276
- if name.startswith("nar_"):
277
- print(f"NAR parameter: {name}")
278
- yield param
279
-
280
- def stage_named_parameters(
281
- self, stage: int = 1
282
- ) -> Iterator[Tuple[str, nn.Parameter]]:
283
- assert stage > 0
284
- if stage == 1:
285
- for pair in self.named_parameters():
286
- if pair[0].startswith("ar_"):
287
- yield pair
288
-
289
- if stage == 2:
290
- for pair in self.named_parameters():
291
- if pair[0].startswith("nar_"):
292
- yield pair
293
-
294
- def pad_y_eos(self, y, y_mask_int, eos_id):
295
- targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad(
296
- y_mask_int, (0, 1), value=1
297
- )
298
- # inputs, targets
299
- if self.ar_audio_prepend_bos:
300
- return (
301
- F.pad(targets[:, :-1], (1, 0), value=NUM_AUDIO_TOKENS + 1),
302
- targets,
303
- )
304
-
305
- return targets[:, :-1], targets[:, 1:]
306
-
307
- def _prepare_prompts(self, y, y_lens, codes, nar_stage, y_prompts_codes, prefix_mode):
308
- # 5.1 For the NAR acoustic prompt tokens, we select a random segment waveform of 3 seconds
309
- # from the same utterance.
310
- # We implement this differently.
311
- if prefix_mode == 0:
312
- # no prefix
313
- prefix_len = 0
314
- y_emb = self.nar_audio_embeddings[0](y)
315
- for j in range(1, nar_stage):
316
- # Formula (4) (5)
317
- y_emb = y_emb + self.nar_audio_embeddings[j](codes[..., j])
318
- elif prefix_mode == 1:
319
- # prefix at beginning
320
- int_low = (0.25 * y_lens.min()).type(torch.int64).item()
321
- prefix_len = torch.randint(0, int_low * 2, size=()).item()
322
- prefix_len = min(prefix_len, 225) # 24000/320 * 3s = 225 frames
323
-
324
- y_prompts = self.nar_audio_embeddings[0](y[:, :prefix_len])
325
- y_emb = self.nar_audio_embeddings[0](y[:, prefix_len:])
326
- for j in range(1, self.num_quantizers):
327
- y_prompts += self.nar_audio_embeddings[j](
328
- codes[:, :prefix_len, j]
329
- )
330
- if j < nar_stage:
331
- y_emb += self.nar_audio_embeddings[j](
332
- codes[:, prefix_len:, j]
333
- )
334
- y_emb = torch.concat([y_prompts, y_emb], axis=1)
335
- elif prefix_mode in [2, 4]:
336
- if prefix_mode == 2:
337
- # random prefix
338
- prefix_len = min(225, int(0.25 * y_lens.min().item()))
339
-
340
- y_prompts_codes = []
341
- for b in range(codes.shape[0]):
342
- start = self.rng.randint(0, y_lens[b].item() - prefix_len)
343
- y_prompts_codes.append(
344
- torch.clone(codes[b, start : start + prefix_len])
345
- )
346
- codes[
347
- b, start : start + prefix_len, nar_stage
348
- ] = NUM_AUDIO_TOKENS
349
- y_prompts_codes = torch.stack(y_prompts_codes, dim=0)
350
- else:
351
- prefix_len = y_prompts_codes.shape[1]
352
-
353
- y_prompts = self.nar_audio_embeddings[0](y_prompts_codes[..., 0])
354
- y_emb = self.nar_audio_embeddings[0](y)
355
- for j in range(1, self.num_quantizers):
356
- y_prompts += self.nar_audio_embeddings[j](
357
- y_prompts_codes[..., j]
358
- )
359
- if j < nar_stage:
360
- y_emb += self.nar_audio_embeddings[j](codes[..., j])
361
- y_emb = torch.concat([y_prompts, y_emb], axis=1)
362
- else:
363
- raise ValueError
364
-
365
- return y_emb, prefix_len
366
-
367
- def forward(
368
- self,
369
- x: torch.Tensor,
370
- x_lens: torch.Tensor,
371
- y: Union[torch.Tensor, PromptedFeatures],
372
- y_lens: Union[torch.Tensor, PromptedFeatures],
373
- reduction: str = "sum",
374
- train_stage: int = 0,
375
- **kwargs,
376
- ) -> Tuple[torch.Tensor, Union[torch.Tensor, None]]:
377
- raise NotImplementedError
378
-
379
- def inference(
380
- self,
381
- x: torch.Tensor,
382
- x_lens: torch.Tensor,
383
- y: torch.Tensor,
384
- enroll_x_lens: Union[torch.Tensor, None] = None,
385
- top_k: int = -100,
386
- temperature: float = 1.0,
387
- ) -> torch.Tensor:
388
- raise NotImplementedError
389
-
390
- def visualize(
391
- self,
392
- predicts: Tuple[torch.Tensor],
393
- batch: Dict[str, Union[List, torch.Tensor]],
394
- output_dir: str,
395
- limit: int = 4,
396
- ) -> None:
397
- raise NotImplementedError
398
-
399
-
400
- class VALLE(VALLF):
401
- """It implements https://arxiv.org/abs/2301.02111
402
- "Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers"
403
- """
404
-
405
- def __init__(
406
- self,
407
- d_model: int,
408
- nhead: int,
409
- num_layers: int,
410
- norm_first: bool = True,
411
- add_prenet: bool = False,
412
- prefix_mode: int = 0,
413
- share_embedding: bool = True,
414
- nar_scale_factor: float = 1.0,
415
- **kwargs,
416
- ):
417
- """
418
- Args:
419
- d_model:
420
- The number of expected features in the input (required).
421
- nhead:
422
- The number of heads in the multiheadattention models (required).
423
- num_layers:
424
- The number of sub-decoder-layers in the decoder (required).
425
- """
426
- super(VALLE, self).__init__(
427
- d_model,
428
- nhead,
429
- num_layers,
430
- norm_first=norm_first,
431
- add_prenet=add_prenet,
432
- decoder_cls=TransformerEncoder,
433
- decoder_layer_cls=TransformerEncoderLayer,
434
- prefix_mode=prefix_mode,
435
- share_embedding=share_embedding,
436
- nar_scale_factor=nar_scale_factor,
437
- **kwargs,
438
- )
439
- self.language_ID = {
440
- 'en': 0,
441
- 'zh': 1,
442
- 'ja': 2,
443
- }
444
- self.ar_language_embedding = TokenEmbedding(d_model, len(self.language_ID))
445
- self.nar_language_embedding = TokenEmbedding(d_model, len(self.language_ID))
446
-
447
- def forward(
448
- self,
449
- x: torch.Tensor,
450
- x_lens: torch.Tensor,
451
- y: Union[torch.Tensor, PromptedFeatures],
452
- y_lens: Union[torch.Tensor, PromptedFeatures],
453
- reduction: str = "sum",
454
- train_stage: int = 0,
455
- **kwargs,
456
- ):
457
- raise NotImplementedError
458
- def inference(
459
- self,
460
- x: torch.Tensor,
461
- x_lens: torch.Tensor,
462
- y: torch.Tensor,
463
- enroll_x_lens: torch.Tensor,
464
- top_k: int = -100,
465
- temperature: float = 1.0,
466
- prompt_language: str = None,
467
- text_language: str = None,
468
- best_of: int = 1,
469
- length_penalty: float = 1.0,
470
- return_worst: bool = False,
471
- ) -> torch.Tensor:
472
- """
473
- Args:
474
- x:
475
- A 2-D tensor of shape (1, S).
476
- x_lens:
477
- A 1-D tensor of shape (1,). It contains the number of tokens in `x`
478
- before padding.
479
- y:
480
- A 3-D tensor of shape (1, T, 8).
481
- top_k: (`optional`) int
482
- The number of highest probability tokens to keep for top-k-filtering. Default to -100.
483
- temperature: (`optional`) float
484
- The value used to modulate the next token probabilities. Must be strictly positive. Default to 1.0.
485
- Returns:
486
- Return the predicted audio code matrix.
487
- """
488
- assert x.ndim == 2, x.shape
489
- assert x_lens.ndim == 1, x_lens.shape
490
- assert y.ndim == 3, y.shape
491
- assert y.shape[0] == 1, y.shape
492
-
493
- assert torch.all(x_lens > 0)
494
-
495
- # NOTE: x has been padded in TextTokenCollater
496
- text = x
497
- x = self.ar_text_embedding(text)
498
- # Add language embedding
499
- prompt_language_id = torch.LongTensor(np.array([self.language_ID[prompt_language]])).to(x.device)
500
- if isinstance(text_language, str):
501
- text_language_id = torch.LongTensor(np.array([self.language_ID[text_language]])).to(x.device)
502
- elif isinstance(text_language, List):
503
- text_language_id = torch.LongTensor(np.array([self.language_ID[tl] for tl in text_language])).to(x.device)
504
- x[:, :enroll_x_lens, :] += self.ar_language_embedding(prompt_language_id)
505
- x[:, enroll_x_lens:, :] += self.ar_language_embedding(text_language_id)
506
- x = self.ar_text_prenet(x)
507
- x = self.ar_text_position(x)
508
-
509
- text_len = x_lens.max()
510
- prompts = y
511
- prefix_len = y.shape[1]
512
-
513
- # AR Decoder
514
- # TODO: manage decoder steps to avoid repetitive computation
515
- y = prompts[..., 0]
516
- if self.ar_audio_prepend_bos:
517
- y = F.pad(y, (1, 0), value=NUM_AUDIO_TOKENS + 1)
518
-
519
- x_len = x_lens.max()
520
- x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
521
-
522
- kv_cache = None
523
- use_kv_caching = True
524
-
525
- sum_logprobs = torch.zeros(best_of, device=y.device) # implement batch decoding here
526
- x = x.repeat(best_of, 1, 1)
527
- y = y.repeat(best_of, 1)
528
- while True:
529
- y_emb = self.ar_audio_embedding(y)
530
- y_emb = self.ar_audio_prenet(y_emb)
531
- y_pos = self.ar_audio_position(y_emb)
532
- xy_pos = torch.concat([x, y_pos], dim=1)
533
-
534
- y_len = y.shape[1]
535
- x_attn_mask_pad = F.pad(
536
- x_attn_mask,
537
- (0, y_len),
538
- value=True,
539
- )
540
- y_attn_mask = F.pad(
541
- torch.triu(
542
- torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1
543
- ),
544
- (x_len, 0),
545
- value=False,
546
- )
547
- xy_attn_mask = torch.concat(
548
- [x_attn_mask_pad, y_attn_mask], dim=0
549
- ).to(y.device)
550
-
551
-
552
- if use_kv_caching and kv_cache is not None:
553
- xy_pos = xy_pos[:, [-1]]
554
- else:
555
- pass
556
-
557
- xy_dec, kv_cache = self.ar_decoder.infer(
558
- xy_pos,
559
- mask=xy_attn_mask,
560
- past_kv=kv_cache,
561
- use_cache=use_kv_caching,
562
- )
563
- # xy_dec, _ = self.ar_decoder(
564
- # (xy_pos, None),
565
- # mask=xy_attn_mask,
566
- # )
567
-
568
- logits = self.ar_predict_layer(xy_dec[:, -1])
569
- samples, current_logprobs = topk_sampling(
570
- logits, top_k=top_k, top_p=1, temperature=temperature
571
- )
572
- sum_logprobs += current_logprobs * (y[:, -1] != NUM_AUDIO_TOKENS)
573
- samples[y[:, -1] == NUM_AUDIO_TOKENS] = NUM_AUDIO_TOKENS
574
- completed = (samples[:, -1] == NUM_AUDIO_TOKENS).all()
575
- if (
576
- completed
577
- or (y.shape[1] - prompts.shape[1]) > x_lens.max() * 16
578
- ):
579
- if prompts.shape[1] == y.shape[1]:
580
- raise SyntaxError(
581
- "well trained model shouldn't reach here."
582
- )
583
- lengths = torch.sum(y != NUM_AUDIO_TOKENS, dim=1)
584
- avg_logprobs = sum_logprobs / lengths ** length_penalty
585
- # choose the best beam according to sum_logprobs
586
- best_beam = y[torch.argmax(avg_logprobs), :]
587
- worst_beam = y[torch.argmin(avg_logprobs), :]
588
- # strip all eos tokens
589
- best_beam = best_beam[best_beam != NUM_AUDIO_TOKENS]
590
- worst_beam = worst_beam[worst_beam != NUM_AUDIO_TOKENS]
591
- if return_worst:
592
- y = worst_beam.unsqueeze(0)
593
- else:
594
- y = best_beam.unsqueeze(0)
595
- print(f"VALL-E EOS [{prompts.shape[1]} -> {y.shape[1]}]")
596
- break
597
-
598
- y = torch.concat([y, samples], dim=1)
599
-
600
- codes = [y[:, prefix_len + int(self.ar_audio_prepend_bos) :]]
601
- if self.num_quantizers == 1:
602
- return torch.stack(codes, dim=-1)
603
-
604
- # Non-AR Decoders
605
- y_emb = self.nar_audio_embeddings[0](
606
- y[:, int(self.ar_audio_prepend_bos) :]
607
- )
608
-
609
- if self.prefix_mode in [2, 4]: # Exclude enrolled_phonemes
610
- enrolled_len = enroll_x_lens.max().item()
611
- # SOS + Synthesis Text + EOS
612
- text = torch.concat(
613
- [
614
- text[:, :1],
615
- text[:, enrolled_len - 1 :],
616
- ],
617
- dim=1,
618
- )
619
- text_len = text_len - (enrolled_len - 2)
620
- assert text.shape[0] == 1
621
-
622
- x = self.nar_text_embedding(text)
623
- # Add language embedding
624
- prompt_language_id = torch.LongTensor(np.array([self.language_ID[prompt_language]])).to(x.device)
625
- if isinstance(text_language, str):
626
- text_language_id = torch.LongTensor(np.array([self.language_ID[text_language]])).to(x.device)
627
- elif isinstance(text_language, List):
628
- text_language_id = torch.LongTensor(np.array([self.language_ID[tl] for tl in text_language])).to(x.device)
629
- x[:, :enroll_x_lens, :] += self.nar_language_embedding(prompt_language_id)
630
- x[:, enroll_x_lens:, :] += self.nar_language_embedding(text_language_id)
631
- x = self.nar_text_prenet(x)
632
- x = self.nar_text_position(x)
633
-
634
- if self.prefix_mode == 0:
635
- for i, (predict_layer, embedding_layer) in enumerate(
636
- zip(
637
- self.nar_predict_layers,
638
- self.nar_audio_embeddings[1:],
639
- )
640
- ):
641
- y_pos = self.nar_audio_prenet(y_emb)
642
- y_pos = self.nar_audio_position(y_pos)
643
- xy_pos = torch.concat([x, y_pos], dim=1)
644
-
645
- xy_dec, _ = self.nar_decoder(
646
- (xy_pos, self.nar_stage_embeddings[i].weight)
647
- )
648
- logits = predict_layer(xy_dec[:, text_len + prefix_len :])
649
-
650
- samples = torch.argmax(logits, dim=-1)
651
- codes.append(samples)
652
-
653
- if i < self.num_quantizers - 2:
654
- y_emb[:, :prefix_len] += embedding_layer(
655
- prompts[..., i + 1]
656
- )
657
- y_emb[:, prefix_len:] += embedding_layer(samples)
658
- else:
659
- for j in range(1, self.num_quantizers):
660
- y_emb[:, :prefix_len] += self.nar_audio_embeddings[j](
661
- prompts[..., j]
662
- )
663
-
664
- for i, (predict_layer, embedding_layer) in enumerate(
665
- zip(
666
- self.nar_predict_layers,
667
- self.nar_audio_embeddings[1:],
668
- )
669
- ):
670
- y_pos = self.nar_audio_prenet(y_emb)
671
- y_pos = self.nar_audio_position(y_pos)
672
- xy_pos = torch.concat([x, y_pos], dim=1)
673
-
674
- xy_dec, _ = self.nar_decoder(
675
- (xy_pos, self.nar_stage_embeddings[i].weight)
676
- )
677
- logits = predict_layer(xy_dec[:, text_len + prefix_len :])
678
-
679
- samples = torch.argmax(logits, dim=-1)
680
- codes.append(samples)
681
-
682
- if i < self.num_quantizers - 2:
683
- y_emb[:, prefix_len:] += embedding_layer(samples)
684
-
685
- assert len(codes) == self.num_quantizers
686
- return torch.stack(codes, dim=-1)
687
-
688
- def continual(
689
- self,
690
- x: torch.Tensor,
691
- x_lens: torch.Tensor,
692
- y: torch.Tensor,
693
- ) -> torch.Tensor:
694
- """
695
- Args:
696
- x:
697
- A 2-D tensor of shape (1, S).
698
- x_lens:
699
- A 1-D tensor of shape (1,). It contains the number of tokens in `x`
700
- before padding.
701
- y:
702
- A 3-D tensor of shape (1, T, 8).
703
- Returns:
704
- Return the predicted audio code matrix.
705
- """
706
- assert x.ndim == 2, x.shape
707
- assert x_lens.ndim == 1, x_lens.shape
708
- assert y.ndim == 3, y.shape
709
- assert y.shape[0] == 1, y.shape
710
-
711
- assert torch.all(x_lens > 0)
712
- assert self.num_quantizers == 8
713
-
714
- # NOTE: x has been padded in TextTokenCollater
715
- text = x
716
- x = self.ar_text_embedding(text)
717
- x = self.ar_text_prenet(x)
718
- x = self.ar_text_position(x)
719
-
720
- text_len = x_lens.max()
721
-
722
- prefix_len = min(int(y.shape[1] * 0.5), 3 * 75)
723
-
724
- # AR Decoder
725
- prompts = y[:, :prefix_len]
726
-
727
- codes = [y[:, prefix_len:, 0]]
728
- # Non-AR Decoders
729
- x = self.nar_text_embedding(text)
730
- x = self.nar_text_prenet(x)
731
- x = self.nar_text_position(x)
732
-
733
- y_emb = self.nar_audio_embeddings[0](y[..., 0])
734
-
735
- if self.prefix_mode == 0:
736
- for i, (predict_layer, embedding_layer) in enumerate(
737
- zip(
738
- self.nar_predict_layers,
739
- self.nar_audio_embeddings[1:],
740
- )
741
- ):
742
- y_pos = self.nar_audio_position(y_emb)
743
- y_pos = self.nar_audio_prenet(y_pos)
744
- xy_pos = torch.concat([x, y_pos], dim=1)
745
-
746
- xy_dec, _ = self.nar_decoder(
747
- (xy_pos, self.nar_stage_embeddings[i].weight)
748
- )
749
- logits = predict_layer(xy_dec[:, text_len + prefix_len :])
750
-
751
- samples = torch.argmax(logits, dim=-1)
752
- codes.append(samples)
753
-
754
- if i < 6:
755
- y_emb[:, :prefix_len] += embedding_layer(
756
- prompts[..., i + 1]
757
- )
758
- y_emb[:, prefix_len:] += embedding_layer(samples)
759
- else:
760
- for j in range(1, 8):
761
- y_emb[:, :prefix_len] += self.nar_audio_embeddings[j](
762
- prompts[..., j]
763
- )
764
-
765
- for i, (predict_layer, embedding_layer) in enumerate(
766
- zip(
767
- self.nar_predict_layers,
768
- self.nar_audio_embeddings[1:],
769
- )
770
- ):
771
- y_pos = self.nar_audio_prenet(y_emb)
772
- y_pos = self.nar_audio_position(y_pos)
773
- xy_pos = torch.concat([x, y_pos], dim=1)
774
-
775
- xy_dec, _ = self.nar_decoder(
776
- (xy_pos, self.nar_stage_embeddings[i].weight)
777
- )
778
- logits = predict_layer(xy_dec[:, text_len + prefix_len :])
779
-
780
- samples = torch.argmax(logits, dim=-1)
781
- codes.append(samples)
782
-
783
- if i < 6:
784
- y_emb[:, prefix_len:] += embedding_layer(samples)
785
-
786
- assert len(codes) == 8
787
- return torch.stack(codes, dim=-1)
788
-
789
-
790
- # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
791
- def top_k_top_p_filtering(
792
- logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1
793
- ):
794
- """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
795
- Args:
796
- logits: logits distribution shape (batch size, vocabulary size)
797
- if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
798
- if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
799
- Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
800
- Make sure we keep at least min_tokens_to_keep per batch example in the output
801
- From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
802
- """
803
- if top_k > 0:
804
- top_k = min(
805
- max(top_k, min_tokens_to_keep), logits.size(-1)
806
- ) # Safety check
807
- # Remove all tokens with a probability less than the last token of the top-k
808
- indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
809
- logits[indices_to_remove] = filter_value
810
-
811
- if top_p < 1.0:
812
- sorted_logits, sorted_indices = torch.sort(logits, descending=True)
813
- cumulative_probs = torch.cumsum(
814
- F.softmax(sorted_logits, dim=-1), dim=-1
815
- )
816
-
817
- # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
818
- sorted_indices_to_remove = cumulative_probs > top_p
819
- if min_tokens_to_keep > 1:
820
- # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
821
- sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
822
- # Shift the indices to the right to keep also the first token above the threshold
823
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
824
- ..., :-1
825
- ].clone()
826
- sorted_indices_to_remove[..., 0] = 0
827
-
828
- # scatter sorted tensors to original indexing
829
- indices_to_remove = sorted_indices_to_remove.scatter(
830
- 1, sorted_indices, sorted_indices_to_remove
831
- )
832
- logits[indices_to_remove] = filter_value
833
- return logits
834
-
835
-
836
- def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0):
837
- # temperature: (`optional`) float
838
- # The value used to modulate the next token probabilities. Must be strictly positive. Default to 1.0.
839
- # top_k: (`optional`) int
840
- # The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
841
- # top_p: (`optional`) float
842
- # The cumulative probability mass of the highest-probability vocabulary tokens kept for nucleus sampling. Must be between 0 and 1. Default to 1.
843
-
844
- # Temperature (higher temperature => more likely to sample low probability tokens)
845
- if temperature != 1.0:
846
- logits = logits / temperature
847
- # Top-p/top-k filtering
848
- logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
849
- # Sample
850
- token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
851
- logprobs = F.log_softmax(logits.float(), dim=-1)
852
- current_logprobs = logprobs[torch.arange(logprobs.shape[0]), token.squeeze(1)]
853
- return token, current_logprobs
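
A standalone sketch (not part of the deleted file) of the sampling step the AR decoder performs at each position: keep the top-k logits, renormalise, and draw one token per row, recording its log-probability for the length-penalised beam selection above. It mirrors top_k_top_p_filtering plus topk_sampling but omits the nucleus (top-p) path; the vocabulary size of 1025 (1024 audio codes plus EOS) is an assumption.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(2, 1025)                                     # (batch, vocab); 1024 codes + EOS (assumed)
top_k = 10
kth_best = torch.topk(logits, top_k).values[..., -1, None]        # value of the k-th largest logit per row
filtered = logits.masked_fill(logits < kth_best, float("-inf"))   # drop everything below the k-th logit
probs = F.softmax(filtered, dim=-1)
token = torch.multinomial(probs, num_samples=1)                   # (2, 1) sampled ids
logprob = F.log_softmax(filtered, dim=-1).gather(1, token).squeeze(1)
print(token.squeeze(1), logprob)                                  # sampled codes and their log-probabilities
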
models/visualizer.py DELETED
@@ -1,106 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright 2023 (authors: Feiteng Li)
3
- #
4
- # See ../../../../LICENSE for clarification regarding multiple authors
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
-
19
- from typing import Dict, List, Tuple, Union
20
-
21
- import matplotlib.pyplot as plt
22
- import numpy as np
23
- import torch
24
-
25
-
26
- def visualize(
27
- predicts: Tuple[torch.Tensor],
28
- batch: Dict[str, Union[List, torch.Tensor]],
29
- output_dir: str,
30
- limit: int = 4,
31
- ) -> None:
32
- text_tokens = batch["text_tokens"].to("cpu").detach().numpy()
33
- text_tokens_lens = batch["text_tokens_lens"].to("cpu").detach().numpy()
34
- audio_features = batch["audio_features"].to("cpu").detach().numpy()
35
- audio_features_lens = (
36
- batch["audio_features_lens"].to("cpu").detach().numpy()
37
- )
38
- assert text_tokens.ndim == 2
39
-
40
- utt_ids, texts = batch["utt_id"], batch["text"]
41
-
42
- encoder_outputs = predicts[0].to("cpu").type(torch.float32).detach().numpy()
43
- decoder_outputs = predicts[1]
44
- if isinstance(decoder_outputs, list):
45
- decoder_outputs = decoder_outputs[-1]
46
- decoder_outputs = (
47
- decoder_outputs.to("cpu").type(torch.float32).detach().numpy()
48
- )
49
-
50
- vmin, vmax = 0, 1024 # Encodec
51
- if decoder_outputs.dtype == np.float32:
52
- vmin, vmax = -6, 0 # Fbank
53
-
54
- num_figures = 3
55
- for b, (utt_id, text) in enumerate(zip(utt_ids[:limit], texts[:limit])):
56
- _ = plt.figure(figsize=(14, 8 * num_figures))
57
-
58
- S = text_tokens_lens[b]
59
- T = audio_features_lens[b]
60
-
61
- # encoder
62
- plt.subplot(num_figures, 1, 1)
63
- plt.title(f"Text: {text}")
64
- plt.imshow(
65
- X=np.transpose(encoder_outputs[b]),
66
- cmap=plt.get_cmap("jet"),
67
- aspect="auto",
68
- interpolation="nearest",
69
- )
70
- plt.gca().invert_yaxis()
71
- plt.axvline(x=S - 0.4, linewidth=2, color="r")
72
- plt.xlabel("Encoder Output")
73
- plt.colorbar()
74
-
75
- # decoder
76
- plt.subplot(num_figures, 1, 2)
77
- plt.imshow(
78
- X=np.transpose(decoder_outputs[b]),
79
- cmap=plt.get_cmap("jet"),
80
- aspect="auto",
81
- interpolation="nearest",
82
- vmin=vmin,
83
- vmax=vmax,
84
- )
85
- plt.gca().invert_yaxis()
86
- plt.axvline(x=T - 0.4, linewidth=2, color="r")
87
- plt.xlabel("Decoder Output")
88
- plt.colorbar()
89
-
90
- # target
91
- plt.subplot(num_figures, 1, 3)
92
- plt.imshow(
93
- X=np.transpose(audio_features[b]),
94
- cmap=plt.get_cmap("jet"),
95
- aspect="auto",
96
- interpolation="nearest",
97
- vmin=vmin,
98
- vmax=vmax,
99
- )
100
- plt.gca().invert_yaxis()
101
- plt.axvline(x=T - 0.4, linewidth=2, color="r")
102
- plt.xlabel("Decoder Target")
103
- plt.colorbar()
104
-
105
- plt.savefig(f"{output_dir}/{utt_id}.png")
106
- plt.close()
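
A hedged sketch (not part of the deleted file) of the tensor shapes visualize() above expects; every value is a random placeholder and the output directory is hypothetical.

import torch

B, S, T, D = 2, 12, 40, 100                                # batch, text length, frame count, feature dim (assumed)
batch = {
    "utt_id": ["utt0", "utt1"],
    "text": ["hello", "world"],
    "text_tokens": torch.randint(0, 50, (B, S)),
    "text_tokens_lens": torch.tensor([10, 12]),
    "audio_features": torch.rand(B, T, D) * -6.0,          # float features land in the Fbank vmin/vmax range
    "audio_features_lens": torch.tensor([35, 40]),
}
predicts = (torch.rand(B, S, 32), torch.rand(B, T, D))     # (encoder_outputs, decoder_outputs)
# visualize(predicts, batch, output_dir="/tmp/vis", limit=2)  # would write one PNG per utterance
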
modules/__init__.py DELETED
File without changes
modules/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (175 Bytes)
 
modules/__pycache__/activation.cpython-311.pyc DELETED
Binary file (27.5 kB)
 
modules/__pycache__/embedding.cpython-311.pyc DELETED
Binary file (6.15 kB)
 
modules/__pycache__/scaling.cpython-311.pyc DELETED
Binary file (69 kB)
 
modules/__pycache__/transformer.cpython-311.pyc DELETED
Binary file (28.2 kB)
 
modules/activation.py DELETED
@@ -1,612 +0,0 @@
1
- from typing import Optional, Tuple, List
2
- import math
3
-
4
- import torch
5
- from torch import Tensor
6
- from torch.nn import Linear, Module
7
- from torch.nn import functional as F
8
- from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
9
- from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
10
- from torch.nn.parameter import Parameter
11
-
12
- def _in_projection_packed(
13
- q: Tensor,
14
- k: Tensor,
15
- v: Tensor,
16
- w: Tensor,
17
- b: Optional[Tensor] = None,
18
- ) -> List[Tensor]:
19
- r"""
20
- Performs the in-projection step of the attention operation, using packed weights.
21
- Output is a triple containing projection tensors for query, key and value.
22
-
23
- Args:
24
- q, k, v: query, key and value tensors to be projected. For self-attention,
25
- these are typically the same tensor; for encoder-decoder attention,
26
- k and v are typically the same tensor. (We take advantage of these
27
- identities for performance if they are present.) Regardless, q, k and v
28
- must share a common embedding dimension; otherwise their shapes may vary.
29
- w: projection weights for q, k and v, packed into a single tensor. Weights
30
- are packed along dimension 0, in q, k, v order.
31
- b: optional projection biases for q, k and v, packed into a single tensor
32
- in q, k, v order.
33
-
34
- Shape:
35
- Inputs:
36
- - q: :math:`(..., E)` where E is the embedding dimension
37
- - k: :math:`(..., E)` where E is the embedding dimension
38
- - v: :math:`(..., E)` where E is the embedding dimension
39
- - w: :math:`(E * 3, E)` where E is the embedding dimension
40
- - b: :math:`E * 3` where E is the embedding dimension
41
-
42
- Output:
43
- - in output list :math:`[q', k', v']`, each output tensor will have the
44
- same shape as the corresponding input tensor.
45
- """
46
- E = q.size(-1)
47
- if k is v:
48
- if q is k:
49
- # self-attention
50
- return F.linear(q, w, b).chunk(3, dim=-1)
51
- else:
52
- # encoder-decoder attention
53
- w_q, w_kv = w.split([E, E * 2])
54
- if b is None:
55
- b_q = b_kv = None
56
- else:
57
- b_q, b_kv = b.split([E, E * 2])
58
- return (F.linear(q, w_q, b_q),) + F.linear(k, w_kv, b_kv).chunk(2, dim=-1)
59
- else:
60
- w_q, w_k, w_v = w.chunk(3)
61
- if b is None:
62
- b_q = b_k = b_v = None
63
- else:
64
- b_q, b_k, b_v = b.chunk(3)
65
- return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
66
-
67
- def _scaled_dot_product_attention(
68
- q: Tensor,
69
- k: Tensor,
70
- v: Tensor,
71
- attn_mask: Optional[Tensor] = None,
72
- dropout_p: float = 0.0,
73
- ) -> Tuple[Tensor, Tensor]:
74
- r"""
75
- Computes scaled dot product attention on query, key and value tensors, using
76
- an optional attention mask if passed, and applying dropout if a probability
77
- greater than 0.0 is specified.
78
- Returns a tensor pair containing attended values and attention weights.
79
-
80
- Args:
81
- q, k, v: query, key and value tensors. See Shape section for shape details.
82
- attn_mask: optional tensor containing mask values to be added to calculated
83
- attention. May be 2D or 3D; see Shape section for details.
84
- dropout_p: dropout probability. If greater than 0.0, dropout is applied.
85
-
86
- Shape:
87
- - q: :math:`(B, Nt, E)` where B is batch size, Nt is the target sequence length,
88
- and E is embedding dimension.
89
- - key: :math:`(B, Ns, E)` where B is batch size, Ns is the source sequence length,
90
- and E is embedding dimension.
91
- - value: :math:`(B, Ns, E)` where B is batch size, Ns is the source sequence length,
92
- and E is embedding dimension.
93
- - attn_mask: either a 3D tensor of shape :math:`(B, Nt, Ns)` or a 2D tensor of
94
- shape :math:`(Nt, Ns)`.
95
-
96
- - Output: attention values have shape :math:`(B, Nt, E)`; attention weights
97
- have shape :math:`(B, Nt, Ns)`
98
- """
99
- B, Nt, E = q.shape
100
- q = q / math.sqrt(E)
101
- # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
102
- if attn_mask is not None:
103
- attn = torch.baddbmm(attn_mask, q, k.transpose(-2, -1))
104
- else:
105
- attn = torch.bmm(q, k.transpose(-2, -1))
106
-
107
- attn = F.softmax(attn, dim=-1)
108
- if dropout_p > 0.0:
109
- attn = F.dropout(attn, p=dropout_p)
110
- # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
111
- output = torch.bmm(attn, v)
112
- return output, attn
113
-
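# Illustrative aside (not from the deleted file): a quick shape check for the
# _scaled_dot_product_attention helper above. q is (B, Nt, E), k/v are (B, Ns, E);
# the attention weights come out (B, Nt, Ns) and the attended values (B, Nt, E).
import math
import torch
import torch.nn.functional as F

B, Nt, Ns, E = 2, 3, 5, 8
q, k, v = torch.randn(B, Nt, E), torch.randn(B, Ns, E), torch.randn(B, Ns, E)
attn = F.softmax((q / math.sqrt(E)) @ k.transpose(-2, -1), dim=-1)  # (B, Nt, Ns)
out = attn @ v                                                      # (B, Nt, E)
print(out.shape, attn.shape)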
114
- def multi_head_attention_forward(
115
- x,
116
- ipw,
117
- ipb,
118
- opw,
119
- opb,
120
- n_head,
121
- attn_mask,
122
- past_kv=None,
123
- use_cache=False,
124
- ):
125
- # x = x.transpose(1, 0)
126
- # tgt_len, bsz, embed_dim = x.shape
127
- # head_dim = embed_dim // n_head
128
- # q, k, v = _in_projection_packed(x, x, x, ipw, ipb)
129
- # q = q.contiguous().view(tgt_len, bsz * n_head, head_dim).transpose(0, 1)
130
- # k = k.contiguous().view(k.shape[0], bsz * n_head, head_dim).transpose(0, 1)
131
- # v = v.contiguous().view(v.shape[0], bsz * n_head, head_dim).transpose(0, 1)
132
-
133
- # new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
134
- # new_attn_mask.masked_fill_(attn_mask, float("-inf"))
135
- # attn_mask = new_attn_mask
136
- #
137
- # attn_output, attn_output_weights = _scaled_dot_product_attention(q, k, v, attn_mask, 0.0)
138
- # attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
139
- # attn_output = torch._C._nn.linear(attn_output, opw, opb)
140
- # attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
141
-
142
- B, T, C = x.size()
143
-
144
- q, k, v = torch._C._nn.linear(x, ipw, ipb).chunk(3, dim=-1)
145
- k = k.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
146
- q = q.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
147
- v = v.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
148
- if past_kv is not None:
149
- past_key = past_kv[0]
150
- past_value = past_kv[1]
151
- k = torch.cat((past_key, k), dim=-2)
152
- v = torch.cat((past_value, v), dim=-2)
153
-
154
- FULL_T = k.shape[-2]
155
-
156
- if use_cache is True:
157
- present = (k, v)
158
- else:
159
- present = None
160
-
161
- att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
162
- att = att.masked_fill(attn_mask[FULL_T - T:FULL_T, :FULL_T], float('-inf'))
163
- att = F.softmax(att, dim=-1)
164
- y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
165
- y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
166
- y = torch._C._nn.linear(y, opw, opb)
167
- return (y, present)
168
-
169
-
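# Illustrative aside (not from the deleted file): a minimal sketch of the KV-cache
# pattern used by multi_head_attention_forward above. Each decoding step projects
# only the newest token and appends its keys/values to the cache along the time
# dimension; the (k, v) pair returned as `present` becomes `past_kv` next step.
import torch

B, n_head, hs = 1, 4, 16
past_kv = (torch.zeros(B, n_head, 5, hs), torch.zeros(B, n_head, 5, hs))  # 5 cached steps
new_k = torch.randn(B, n_head, 1, hs)                                     # current step only
new_v = torch.randn(B, n_head, 1, hs)
k = torch.cat((past_kv[0], new_k), dim=-2)                                # (B, n_head, 6, hs)
v = torch.cat((past_kv[1], new_v), dim=-2)
present = (k, v)
print(k.shape, v.shape)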
170
- class MultiheadAttention(Module):
171
- r"""Allows the model to jointly attend to information
172
- from different representation subspaces as described in the paper:
173
- `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
174
-
175
- Multi-Head Attention is defined as:
176
-
177
- .. math::
178
- \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
179
-
180
- where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
181
-
182
- ``forward()`` will use a special optimized implementation if all of the following
183
- conditions are met:
184
-
185
- - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This
186
- restriction will be loosened in the future.)
187
- - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
188
- - training is disabled (using ``.eval()``)
189
- - dropout is 0
190
- - ``add_bias_kv`` is ``False``
191
- - ``add_zero_attn`` is ``False``
192
- - ``batch_first`` is ``True`` and the input is batched
193
- - ``kdim`` and ``vdim`` are equal to ``embed_dim``
194
- - at most one of ``key_padding_mask`` or ``attn_mask`` is passed
195
- - if a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ is passed, neither ``key_padding_mask``
196
- nor ``attn_mask`` is passed
197
-
198
- If the optimized implementation is in use, a
199
- `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
200
- ``query``/``key``/``value`` to represent padding more efficiently than using a
201
- padding mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_
202
- will be returned, and an additional speedup proportional to the fraction of the input
203
- that is padding can be expected.
204
-
205
- Args:
206
- embed_dim: Total dimension of the model.
207
- num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
208
- across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
209
- dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
210
- bias: If specified, adds bias to input / output projection layers. Default: ``True``.
211
- add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
212
- add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
213
- Default: ``False``.
214
- kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
215
- vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
216
- batch_first: If ``True``, then the input and output tensors are provided
217
- as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
218
-
219
- Examples::
220
-
221
- >>> # xdoctest: +SKIP
222
- >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
223
- >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
224
-
225
- """
226
- __constants__ = ["batch_first"]
227
- bias_k: Optional[torch.Tensor]
228
- bias_v: Optional[torch.Tensor]
229
-
230
- def __init__(
231
- self,
232
- embed_dim,
233
- num_heads,
234
- dropout=0.0,
235
- bias=True,
236
- add_bias_kv=False,
237
- add_zero_attn=False,
238
- kdim=None,
239
- vdim=None,
240
- batch_first=False,
241
- linear1_cls=Linear,
242
- linear2_cls=Linear,
243
- device=None,
244
- dtype=None,
245
- ) -> None:
246
- factory_kwargs = {"device": device, "dtype": dtype}
247
- super(MultiheadAttention, self).__init__()
248
- self.embed_dim = embed_dim
249
- self.kdim = kdim if kdim is not None else embed_dim
250
- self.vdim = vdim if vdim is not None else embed_dim
251
- self._qkv_same_embed_dim = (
252
- self.kdim == embed_dim and self.vdim == embed_dim
253
- )
254
-
255
- self.num_heads = num_heads
256
- self.dropout = dropout
257
- self.batch_first = batch_first
258
- self.head_dim = embed_dim // num_heads
259
- assert (
260
- self.head_dim * num_heads == self.embed_dim
261
- ), "embed_dim must be divisible by num_heads"
262
-
263
- if add_bias_kv:
264
- self.bias_k = Parameter(
265
- torch.empty((1, 1, embed_dim), **factory_kwargs)
266
- )
267
- self.bias_v = Parameter(
268
- torch.empty((1, 1, embed_dim), **factory_kwargs)
269
- )
270
- else:
271
- self.bias_k = self.bias_v = None
272
-
273
- if linear1_cls == Linear:
274
- if not self._qkv_same_embed_dim:
275
- self.q_proj_weight = Parameter(
276
- torch.empty((embed_dim, embed_dim), **factory_kwargs)
277
- )
278
- self.k_proj_weight = Parameter(
279
- torch.empty((embed_dim, self.kdim), **factory_kwargs)
280
- )
281
- self.v_proj_weight = Parameter(
282
- torch.empty((embed_dim, self.vdim), **factory_kwargs)
283
- )
284
- self.register_parameter("in_proj_weight", None)
285
- else:
286
- self.in_proj_weight = Parameter(
287
- torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
288
- )
289
- self.register_parameter("q_proj_weight", None)
290
- self.register_parameter("k_proj_weight", None)
291
- self.register_parameter("v_proj_weight", None)
292
-
293
- if bias:
294
- self.in_proj_bias = Parameter(
295
- torch.empty(3 * embed_dim, **factory_kwargs)
296
- )
297
- else:
298
- self.register_parameter("in_proj_bias", None)
299
- self.out_proj = NonDynamicallyQuantizableLinear(
300
- embed_dim, embed_dim, bias=bias, **factory_kwargs
301
- )
302
-
303
- self._reset_parameters()
304
- else:
305
- if not self._qkv_same_embed_dim:
306
- raise NotImplementedError
307
- else:
308
- self.in_proj_linear = linear1_cls(
309
- embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs
310
- )
311
- self.in_proj_weight = self.in_proj_linear.weight
312
-
313
- self.register_parameter("q_proj_weight", None)
314
- self.register_parameter("k_proj_weight", None)
315
- self.register_parameter("v_proj_weight", None)
316
-
317
- if bias:
318
- self.in_proj_bias = self.in_proj_linear.bias
319
- else:
320
- self.register_parameter("in_proj_bias", None)
321
-
322
- self.out_proj = linear2_cls(
323
- embed_dim, embed_dim, bias=bias, **factory_kwargs
324
- )
325
-
326
- if self.bias_k is not None:
327
- xavier_normal_(self.bias_k)
328
- if self.bias_v is not None:
329
- xavier_normal_(self.bias_v)
330
-
331
- self.add_zero_attn = add_zero_attn
332
-
333
- def _reset_parameters(self):
334
- if self._qkv_same_embed_dim:
335
- xavier_uniform_(self.in_proj_weight)
336
- else:
337
- xavier_uniform_(self.q_proj_weight)
338
- xavier_uniform_(self.k_proj_weight)
339
- xavier_uniform_(self.v_proj_weight)
340
-
341
- if self.in_proj_bias is not None:
342
- constant_(self.in_proj_bias, 0.0)
343
- constant_(self.out_proj.bias, 0.0)
344
-
345
- if self.bias_k is not None:
346
- xavier_normal_(self.bias_k)
347
- if self.bias_v is not None:
348
- xavier_normal_(self.bias_v)
349
-
350
- def __setstate__(self, state):
351
- # Support loading old MultiheadAttention checkpoints generated by v1.1.0
352
- if "_qkv_same_embed_dim" not in state:
353
- state["_qkv_same_embed_dim"] = True
354
-
355
- super(MultiheadAttention, self).__setstate__(state)
356
-
357
- def forward(
358
- self,
359
- query: Tensor,
360
- key: Tensor,
361
- value: Tensor,
362
- key_padding_mask: Optional[Tensor] = None,
363
- need_weights: bool = True,
364
- attn_mask: Optional[Tensor] = None,
365
- average_attn_weights: bool = True,
366
- ) -> Tuple[Tensor, Optional[Tensor]]:
367
- r"""
368
- Args:
369
- query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False``
370
- or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length,
371
- :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``.
372
- Queries are compared against key-value pairs to produce the output.
373
- See "Attention Is All You Need" for more details.
374
- key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False``
375
- or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length,
376
- :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``.
377
- See "Attention Is All You Need" for more details.
378
- value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when
379
- ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source
380
- sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``.
381
- See "Attention Is All You Need" for more details.
382
- key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
383
- to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`.
384
- Binary and byte masks are supported.
385
- For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
386
- the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value.
387
- need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
388
- Default: ``True``.
389
- attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
390
- :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
391
- :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
392
- broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
393
- Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the
394
- corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the
395
- corresponding position is not allowed to attend. For a float mask, the mask values will be added to
396
- the attention weight.
397
- average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
398
- heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
399
- effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads)
400
-
401
- Outputs:
402
- - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched,
403
- :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``,
404
- where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the
405
- embedding dimension ``embed_dim``.
406
- - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``,
407
- returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
408
- :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
409
- :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
410
- head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`.
411
-
412
- .. note::
413
- `batch_first` argument is ignored for unbatched inputs.
414
- """
415
- is_batched = query.dim() == 3
416
- if key_padding_mask is not None:
417
- _kpm_dtype = key_padding_mask.dtype
418
- if _kpm_dtype != torch.bool and not torch.is_floating_point(
419
- key_padding_mask
420
- ):
421
- raise AssertionError(
422
- "only bool and floating types of key_padding_mask are supported"
423
- )
424
- why_not_fast_path = ""
425
- if not is_batched:
426
- why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
427
- elif query is not key or key is not value:
428
- # When lifting this restriction, don't forget to either
429
- # enforce that the dtypes all match or test cases where
430
- # they don't!
431
- why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
432
- elif (
433
- self.in_proj_bias is not None
434
- and query.dtype != self.in_proj_bias.dtype
435
- ):
436
- why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
437
- elif (
438
- self.in_proj_weight is not None
439
- and query.dtype != self.in_proj_weight.dtype
440
- ):
441
- # this case will fail anyway, but at least they'll get a useful error message.
442
- why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
443
- elif self.training:
444
- why_not_fast_path = "training is enabled"
445
- elif not self.batch_first:
446
- why_not_fast_path = "batch_first was not True"
447
- elif self.bias_k is not None:
448
- why_not_fast_path = "self.bias_k was not None"
449
- elif self.bias_v is not None:
450
- why_not_fast_path = "self.bias_v was not None"
451
- elif self.dropout:
452
- why_not_fast_path = f"dropout was {self.dropout}, required zero"
453
- elif self.add_zero_attn:
454
- why_not_fast_path = "add_zero_attn was enabled"
455
- elif not self._qkv_same_embed_dim:
456
- why_not_fast_path = "_qkv_same_embed_dim was not True"
457
- elif attn_mask is not None:
458
- why_not_fast_path = "attn_mask was not None"
459
- elif query.is_nested and key_padding_mask is not None:
460
- why_not_fast_path = (
461
- "key_padding_mask is not supported with NestedTensor input"
462
- )
463
- elif self.num_heads % 2 == 1:
464
- why_not_fast_path = "num_heads is odd"
465
- elif torch.is_autocast_enabled():
466
- why_not_fast_path = "autocast is enabled"
467
-
468
- if not why_not_fast_path:
469
- tensor_args = (
470
- query,
471
- key,
472
- value,
473
- self.in_proj_weight,
474
- self.in_proj_bias,
475
- self.out_proj.weight,
476
- self.out_proj.bias,
477
- )
478
- # We have to use list comprehensions below because TorchScript does not support
479
- # generator expressions.
480
- if torch.overrides.has_torch_function(tensor_args):
481
- why_not_fast_path = "some Tensor argument has_torch_function"
482
- elif not all(
483
- [
484
- (x is None or x.is_cuda or "cpu" in str(x.device))
485
- for x in tensor_args
486
- ]
487
- ):
488
- why_not_fast_path = (
489
- "some Tensor argument is neither CUDA nor CPU"
490
- )
491
- elif torch.is_grad_enabled() and any(
492
- [x is not None and x.requires_grad for x in tensor_args]
493
- ):
494
- why_not_fast_path = (
495
- "grad is enabled and at least one of query or the "
496
- "input/output projection weights or biases requires_grad"
497
- )
498
- if not why_not_fast_path:
499
- return torch._native_multi_head_attention(
500
- query,
501
- key,
502
- value,
503
- self.embed_dim,
504
- self.num_heads,
505
- self.in_proj_weight,
506
- self.in_proj_bias,
507
- self.out_proj.weight,
508
- self.out_proj.bias,
509
- key_padding_mask
510
- if key_padding_mask is not None
511
- else attn_mask,
512
- need_weights,
513
- average_attn_weights,
514
- 1
515
- if key_padding_mask is not None
516
- else 0
517
- if attn_mask is not None
518
- else None,
519
- )
520
-
521
- any_nested = query.is_nested or key.is_nested or value.is_nested
522
- assert not any_nested, (
523
- "MultiheadAttention does not support NestedTensor outside of its fast path. "
524
- + f"The fast path was not hit because {why_not_fast_path}"
525
- )
526
-
527
- if self.batch_first and is_batched:
528
- # make sure that the transpose op does not affect the "is" property
529
- if key is value:
530
- if query is key:
531
- query = key = value = query.transpose(1, 0)
532
- else:
533
- query, key = [x.transpose(1, 0) for x in (query, key)]
534
- value = key
535
- else:
536
- query, key, value = [
537
- x.transpose(1, 0) for x in (query, key, value)
538
- ]
539
-
540
- if not self._qkv_same_embed_dim:
541
- attn_output, attn_output_weights = F.multi_head_attention_forward(
542
- query,
543
- key,
544
- value,
545
- self.embed_dim,
546
- self.num_heads,
547
- self.in_proj_weight,
548
- self.in_proj_bias,
549
- self.bias_k,
550
- self.bias_v,
551
- self.add_zero_attn,
552
- self.dropout,
553
- self.out_proj.weight,
554
- self.out_proj.bias,
555
- training=self.training,
556
- key_padding_mask=key_padding_mask,
557
- need_weights=need_weights,
558
- attn_mask=attn_mask,
559
- use_separate_proj_weight=True,
560
- q_proj_weight=self.q_proj_weight,
561
- k_proj_weight=self.k_proj_weight,
562
- v_proj_weight=self.v_proj_weight,
563
- average_attn_weights=average_attn_weights,
564
- )
565
- else:
566
- attn_output, attn_output_weights = F.multi_head_attention_forward(
567
- query,
568
- key,
569
- value,
570
- self.embed_dim,
571
- self.num_heads,
572
- self.in_proj_weight,
573
- self.in_proj_bias,
574
- self.bias_k,
575
- self.bias_v,
576
- self.add_zero_attn,
577
- self.dropout,
578
- self.out_proj.weight,
579
- self.out_proj.bias,
580
- training=self.training,
581
- key_padding_mask=key_padding_mask,
582
- need_weights=need_weights,
583
- attn_mask=attn_mask,
584
- average_attn_weights=average_attn_weights,
585
- )
586
- if self.batch_first and is_batched:
587
- return attn_output.transpose(1, 0), attn_output_weights
588
- else:
589
- return attn_output, attn_output_weights
590
-
591
- def infer(self,
592
- x: Tensor,
593
- key_padding_mask: Optional[Tensor] = None,
594
- need_weights: bool = True,
595
- attn_mask: Optional[Tensor] = None,
596
- average_attn_weights: bool = True,
597
- past_kv = None,
598
- use_cache = False
599
- ):
600
- # x = x.transpose(1, 0)
601
- y, kv = multi_head_attention_forward(
602
- x=x,
603
- ipw=self.in_proj_weight,
604
- ipb=self.in_proj_bias,
605
- opw=self.out_proj.weight,
606
- opb=self.out_proj.bias,
607
- n_head=self.num_heads,
608
- attn_mask=attn_mask,
609
- past_kv=past_kv,
610
- use_cache=use_cache,
611
- )
612
- return (y, kv)
 
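The infer() method above threads a (key, value) cache through the attention call so that, during autoregressive decoding, only the newest position has to be projected and attended. The multi_head_attention_forward helper it delegates to (with the ipw/ipb/opw/opb arguments) is project-local and not shown in this diff, so the sketch below is only a minimal, self-contained illustration of the same KV-cache idea in plain PyTorch; the function name step_with_cache and the omission of the output projection and biases are simplifications for illustration, not code from this repository.

import torch
import torch.nn.functional as F

def step_with_cache(x_new, w_qkv, n_head, past_kv=None):
    # x_new: (B, 1, E) -- only the newly generated position.
    # w_qkv: (3*E, E) packed query/key/value projection, analogous to in_proj_weight above.
    B, T_new, E = x_new.shape
    q, k, v = (x_new @ w_qkv.t()).chunk(3, dim=-1)        # each (B, 1, E)
    if past_kv is not None:
        past_k, past_v = past_kv
        k = torch.cat([past_k, k], dim=1)                 # (B, T_past + 1, E)
        v = torch.cat([past_v, v], dim=1)

    def split_heads(t):                                   # -> (B, n_head, T, E // n_head)
        return t.view(B, -1, n_head, E // n_head).transpose(1, 2)

    scores = split_heads(q) @ split_heads(k).transpose(-2, -1) / (E // n_head) ** 0.5
    attn = F.softmax(scores, dim=-1)                      # new query attends to all cached keys
    y = (attn @ split_heads(v)).transpose(1, 2).reshape(B, T_new, E)
    return y, (k, v)                                      # (k, v) is the updated cache

Each call appends exactly one position to the cached keys and values, so the per-step cost grows linearly with the sequence length instead of re-running attention over the full prefix.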
modules/embedding.py DELETED
@@ -1,97 +0,0 @@
1
- # Copyright 2023 (authors: Feiteng Li)
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import math
16
-
17
- import torch
18
- import torch.nn as nn
19
-
20
-
21
- class TokenEmbedding(nn.Module):
22
- def __init__(
23
- self,
24
- dim_model: int,
25
- vocab_size: int,
26
- dropout: float = 0.0,
27
- ):
28
- super().__init__()
29
-
30
- self.vocab_size = vocab_size
31
- self.dim_model = dim_model
32
-
33
- self.dropout = torch.nn.Dropout(p=dropout)
34
- self.word_embeddings = nn.Embedding(self.vocab_size, self.dim_model)
35
-
36
- @property
37
- def weight(self) -> torch.Tensor:
38
- return self.word_embeddings.weight
39
-
40
- def embedding(self, index: int) -> torch.Tensor:
41
- return self.word_embeddings.weight[index : index + 1]
42
-
43
- def forward(self, x: torch.Tensor):
44
- X = self.word_embeddings(x)
45
- X = self.dropout(X)
46
-
47
- return X
48
-
49
-
50
- class SinePositionalEmbedding(nn.Module):
51
- def __init__(
52
- self,
53
- dim_model: int,
54
- dropout: float = 0.0,
55
- scale: bool = False,
56
- alpha: bool = False,
57
- ):
58
- super().__init__()
59
- self.dim_model = dim_model
60
- self.x_scale = math.sqrt(dim_model) if scale else 1.0
61
- self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
62
- self.dropout = torch.nn.Dropout(p=dropout)
63
-
64
- self.reverse = False
65
- self.pe = None
66
- self.extend_pe(torch.tensor(0.0).expand(1, 4000))
67
-
68
- def extend_pe(self, x):
69
- """Reset the positional encodings."""
70
- if self.pe is not None:
71
- if self.pe.size(1) >= x.size(1):
72
- if self.pe.dtype != x.dtype or self.pe.device != x.device:
73
- self.pe = self.pe.to(dtype=x.dtype, device=x.device)
74
- return
75
- pe = torch.zeros(x.size(1), self.dim_model)
76
- if self.reverse:
77
- position = torch.arange(
78
- x.size(1) - 1, -1, -1.0, dtype=torch.float32
79
- ).unsqueeze(1)
80
- else:
81
- position = torch.arange(
82
- 0, x.size(1), dtype=torch.float32
83
- ).unsqueeze(1)
84
- div_term = torch.exp(
85
- torch.arange(0, self.dim_model, 2, dtype=torch.float32)
86
- * -(math.log(10000.0) / self.dim_model)
87
- )
88
- pe[:, 0::2] = torch.sin(position * div_term)
89
- pe[:, 1::2] = torch.cos(position * div_term)
90
- pe = pe.unsqueeze(0)
91
- self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
92
-
93
- def forward(self, x: torch.Tensor) -> torch.Tensor:
94
- self.extend_pe(x)
95
- output = x.unsqueeze(-1) if x.ndim == 2 else x
96
- output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
97
- return self.dropout(output)
 
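The two modules deleted above compose in the obvious way: TokenEmbedding maps integer token ids to dense vectors, and SinePositionalEmbedding adds sinusoidal position information on top (scaled by sqrt(dim_model) when scale=True, and weighted by a learnable scalar when alpha=True). A minimal usage sketch, assuming the classes remain importable from their pre-deletion location modules.embedding; the hyperparameter values are arbitrary examples:

import torch
from modules.embedding import TokenEmbedding, SinePositionalEmbedding

emb = TokenEmbedding(dim_model=512, vocab_size=1024, dropout=0.1)
pos = SinePositionalEmbedding(dim_model=512, dropout=0.1, scale=True, alpha=True)

tokens = torch.randint(0, 1024, (2, 100))   # (batch, seq_len) integer token ids
x = emb(tokens)                             # (2, 100, 512) token embeddings
x = pos(x)                                  # same shape, with positional encoding added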
modules/optim.py DELETED
@@ -1,1105 +0,0 @@
1
- # Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
2
- #
3
- # See ../LICENSE for clarification regarding multiple authors
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import contextlib
18
- import logging
19
- import random
20
- from collections import defaultdict
21
- from typing import List, Optional, Tuple, Union
22
-
23
- import torch
24
- from lhotse.utils import fix_random_seed
25
- from torch import Tensor
26
- from torch.optim import Optimizer
27
-
28
-
29
- class BatchedOptimizer(Optimizer):
30
- """
31
- This class adds to class Optimizer the capability to optimize parameters in batches:
32
- it will stack the parameters and their grads for you so the optimizer can work
33
- on tensors with an extra leading dimension. This is intended for speed with GPUs,
34
- as it reduces the number of kernels launched in the optimizer.
35
-
36
- Args:
37
- params:
38
- """
39
-
40
- def __init__(self, params, defaults):
41
- super(BatchedOptimizer, self).__init__(params, defaults)
42
-
43
- @contextlib.contextmanager
44
- def batched_params(self, param_group, group_params_names):
45
- """
46
- This function returns (technically, yields) a list of
47
-        tuples (p, state), where
48
- p is a `fake` parameter that is stacked (over axis 0) from real parameters
49
- that share the same shape, and its gradient is also stacked;
50
- `state` is the state corresponding to this batch of parameters
51
- (it will be physically located in the "state" for one of the real
52
- parameters, the last one that has any particular shape and dtype).
53
-
54
- This function is decorated as a context manager so that it can
55
- write parameters back to their "real" locations.
56
-
57
- The idea is, instead of doing:
58
- <code>
59
- for p in group["params"]:
60
- state = self.state[p]
61
- ...
62
- </code>
63
- you can do:
64
- <code>
65
- with self.batched_params(group["params"]) as batches:
66
- for p, state, p_names in batches:
67
- ...
68
- </code>
69
-
70
- Args:
71
- group: a parameter group, which is a list of parameters; should be
72
- one of self.param_groups.
73
- group_params_names: name for each parameter in group,
74
- which is List[str].
75
- """
76
- batches = defaultdict(
77
- list
78
- ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
79
- batches_names = defaultdict(
80
- list
81
- ) # `batches` maps from tuple (dtype_as_str,*shape) to list of str
82
-
83
- assert len(param_group) == len(group_params_names)
84
- for p, named_p in zip(param_group, group_params_names):
85
- key = (str(p.dtype), *p.shape)
86
- batches[key].append(p)
87
- batches_names[key].append(named_p)
88
-
89
- batches_names_keys = list(batches_names.keys())
90
- sorted_idx = sorted(
91
- range(len(batches_names)), key=lambda i: batches_names_keys[i]
92
- )
93
- batches_names = [
94
- batches_names[batches_names_keys[idx]] for idx in sorted_idx
95
- ]
96
- batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]
97
-
98
- stacked_params_dict = dict()
99
-
100
- # turn batches into a list, in deterministic order.
101
- # tuples will contain tuples of (stacked_param, state, stacked_params_names),
102
- # one for each batch in `batches`.
103
- tuples = []
104
-
105
- for batch, batch_names in zip(batches, batches_names):
106
- p = batch[0]
107
- # we arbitrarily store the state in the
108
- # state corresponding to the 1st parameter in the
109
- # group. class Optimizer will take care of saving/loading state.
110
- state = self.state[p]
111
- p_stacked = torch.stack(batch)
112
- grad = torch.stack(
113
- [
114
- torch.zeros_like(p) if p.grad is None else p.grad
115
- for p in batch
116
- ]
117
- )
118
- p_stacked.grad = grad
119
- stacked_params_dict[key] = p_stacked
120
- tuples.append((p_stacked, state, batch_names))
121
-
122
- yield tuples # <-- calling code will do the actual optimization here!
123
-
124
- for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
125
- for i, p in enumerate(batch): # batch is list of Parameter
126
- p.copy_(stacked_params[i])
127
-
128
-
129
- class ScaledAdam(BatchedOptimizer):
130
- """
131
- Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
132
- proportional to the norm of that parameter; and also learn the scale of the parameter,
133
- in log space, subject to upper and lower limits (as if we had factored each parameter as
134
- param = underlying_param * log_scale.exp())
135
-
136
-
137
- Args:
138
- params: The parameters or param_groups to optimize (like other Optimizer subclasses)
139
- lr: The learning rate. We will typically use a learning rate schedule that starts
140
- at 0.03 and decreases over time, i.e. much higher than other common
141
- optimizers.
142
- clipping_scale: (e.g. 2.0)
143
- A scale for gradient-clipping: if specified, the normalized gradients
144
- over the whole model will be clipped to have 2-norm equal to
145
- `clipping_scale` times the median 2-norm over the most recent period
146
- of `clipping_update_period` minibatches. By "normalized gradients",
147
- we mean after multiplying by the rms parameter value for this tensor
148
- [for non-scalars]; this is appropriate because our update is scaled
149
- by this quantity.
150
- betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad.
151
-              Must satisfy 0 < beta1 <= beta2 < 1.
152
- scalar_lr_scale: A scaling factor on the learning rate, that we use to update the
153
-                   scale of each parameter tensor and scalar parameters of the model.
154
- If each parameter were decomposed
155
- as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
156
-                   would be the scaling factor on the learning rate of p_scale.
157
- eps: A general-purpose epsilon to prevent division by zero
158
- param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
159
- learning the scale on the parameters (we'll constrain the rms of each non-scalar
160
- parameter tensor to be >= this value)
161
- param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
162
- learning the scale on the parameters (we'll constrain the rms of each non-scalar
163
- parameter tensor to be <= this value)
164
- scalar_max: Maximum absolute value for scalar parameters (applicable if your
165
- model has any parameters with numel() == 1).
166
- size_update_period: The periodicity, in steps, with which we update the size (scale)
167
- of the parameter tensor. This is provided to save a little time
168
- in the update.
169
- clipping_update_period: if clipping_scale is specified, this is the period
170
- """
171
-
172
- def __init__(
173
- self,
174
- params,
175
- lr=3e-02,
176
- clipping_scale=None,
177
- betas=(0.9, 0.98),
178
- scalar_lr_scale=0.1,
179
- eps=1.0e-08,
180
- param_min_rms=1.0e-05,
181
- param_max_rms=3.0,
182
- scalar_max=10.0,
183
- size_update_period=4,
184
- clipping_update_period=100,
185
- parameters_names=None,
186
- show_dominant_parameters=True,
187
- ):
188
-
189
- assert parameters_names is not None, (
190
- "Please prepare parameters_names,"
191
- "which is a List[List[str]]. Each List[str] is for a group"
192
- "and each str is for a parameter"
193
- )
194
- defaults = dict(
195
- lr=lr,
196
- clipping_scale=clipping_scale,
197
- betas=betas,
198
- scalar_lr_scale=scalar_lr_scale,
199
- eps=eps,
200
- param_min_rms=param_min_rms,
201
- param_max_rms=param_max_rms,
202
- scalar_max=scalar_max,
203
- size_update_period=size_update_period,
204
- clipping_update_period=clipping_update_period,
205
- )
206
-
207
- super(ScaledAdam, self).__init__(params, defaults)
208
- assert len(self.param_groups) == len(parameters_names)
209
- self.parameters_names = parameters_names
210
- self.show_dominant_parameters = show_dominant_parameters
211
-
212
- def __setstate__(self, state):
213
- super(ScaledAdam, self).__setstate__(state)
214
-
215
- @torch.no_grad()
216
- def step(self, closure=None):
217
- """Performs a single optimization step.
218
-
219
- Arguments:
220
- closure (callable, optional): A closure that reevaluates the model
221
- and returns the loss.
222
- """
223
- loss = None
224
- if closure is not None:
225
- with torch.enable_grad():
226
- loss = closure()
227
-
228
- batch = True
229
-
230
- for group, group_params_names in zip(
231
- self.param_groups, self.parameters_names
232
- ):
233
-
234
- with self.batched_params(
235
- group["params"], group_params_names
236
- ) as batches:
237
-
238
- # batches is list of pairs (stacked_param, state). stacked_param is like
239
- # a regular parameter, and will have a .grad, but the 1st dim corresponds to
240
- # a stacking dim, it is not a real dim.
241
-
242
- if (
243
- len(batches[0][1]) == 0
244
- ): # if len(first state) == 0: not yet initialized
245
- clipping_scale = 1
246
- else:
247
- clipping_scale = self._get_clipping_scale(group, batches)
248
-
249
- for p, state, _ in batches:
250
- # Perform optimization step.
251
- # grad is not going to be None, we handled that when creating the batches.
252
- grad = p.grad
253
- if grad.is_sparse:
254
- raise RuntimeError(
255
- "ScaledAdam optimizer does not support sparse gradients"
256
- )
257
- # State initialization
258
- if len(state) == 0:
259
- self._init_state(group, p, state)
260
-
261
- self._step_one_batch(group, p, state, clipping_scale)
262
-
263
- return loss
264
-
265
- def _init_state(self, group: dict, p: Tensor, state: dict):
266
- """
267
- Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p
268
- is actually the batch dimension, corresponding to batched-together
269
- parameters of a given shape.
270
-
271
-
272
- Args:
273
- group: Dict to look up configuration values.
274
- p: The parameter that we are initializing the state for
275
- state: Dict from string to whatever state we are initializing
276
- """
277
- size_update_period = group["size_update_period"]
278
-
279
- state["step"] = 0
280
-
281
- kwargs = {"device": p.device, "dtype": p.dtype}
282
-
283
- # 'delta' implements conventional momentum. There are
284
- # several different kinds of update going on, so rather than
285
- # compute "exp_avg" like in Adam, we store and decay a
286
- # parameter-change "delta", which combines all forms of
287
- # update. this is equivalent to how it's done in Adam,
288
- # except for the first few steps.
289
- state["delta"] = torch.zeros_like(
290
- p, memory_format=torch.preserve_format
291
- )
292
-
293
- batch_size = p.shape[0]
294
- numel = p.numel() // batch_size
295
- numel = p.numel()
296
-
297
- if numel > 1:
298
- # "param_rms" just periodically records the scalar root-mean-square value of
299
- # the parameter tensor.
300
- # it has a shape like (batch_size, 1, 1, 1, 1)
301
- param_rms = (
302
- (p ** 2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
303
- )
304
- state["param_rms"] = param_rms
305
-
306
- state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
307
- state["scale_grads"] = torch.zeros(
308
- size_update_period, *param_rms.shape, **kwargs
309
- )
310
-
311
- # exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
312
- state["exp_avg_sq"] = torch.zeros_like(
313
- p, memory_format=torch.preserve_format
314
- )
315
-
316
- def _get_clipping_scale(
317
- self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]
318
- ) -> float:
319
- """
320
- Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
321
- by this amount before applying the rest of the update.
322
-
323
- Args:
324
- group: the parameter group, an item in self.param_groups
325
- tuples: a list of tuples of (param, state, param_names)
326
- where param is a batched set of parameters,
327
- with a .grad (1st dim is batch dim)
328
- and state is the state-dict where optimization parameters are kept.
329
- param_names is a List[str] while each str is name for a parameter
330
- in batched set of parameters "param".
331
- """
332
- assert len(tuples) >= 1
333
- clipping_scale = group["clipping_scale"]
334
- (first_p, first_state, _) = tuples[0]
335
- step = first_state["step"]
336
- if clipping_scale is None or step == 0:
337
- # no clipping. return early on step == 0 because the other
338
- # parameters' state won't have been initialized yet.
339
- return 1.0
340
- clipping_update_period = group["clipping_update_period"]
341
-
342
- tot_sumsq = torch.tensor(0.0, device=first_p.device)
343
- for (p, state, param_names) in tuples:
344
- grad = p.grad
345
- if grad.is_sparse:
346
- raise RuntimeError(
347
- "ScaledAdam optimizer does not support sparse gradients"
348
- )
349
- if p.numel() == p.shape[0]: # a batch of scalars
350
- tot_sumsq += (
351
- grad ** 2
352
- ).sum() # sum() to change shape [1] to []
353
- else:
354
- tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()
355
-
356
- tot_norm = tot_sumsq.sqrt()
357
- if "model_norms" not in first_state:
358
- first_state["model_norms"] = torch.zeros(
359
- clipping_update_period, device=p.device
360
- )
361
- first_state["model_norms"][step % clipping_update_period] = tot_norm
362
-
363
- if step % clipping_update_period == 0:
364
- # Print some stats.
365
- # We don't reach here if step == 0 because we would have returned
366
- # above.
367
- sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
368
- quartiles = []
369
- for n in range(0, 5):
370
- index = min(
371
- clipping_update_period - 1,
372
- (clipping_update_period // 4) * n,
373
- )
374
- quartiles.append(sorted_norms[index].item())
375
-
376
- median = quartiles[2]
377
- threshold = clipping_scale * median
378
- first_state["model_norm_threshold"] = threshold
379
- percent_clipped = (
380
- first_state["num_clipped"] * 100.0 / clipping_update_period
381
- if "num_clipped" in first_state
382
- else 0.0
383
- )
384
- first_state["num_clipped"] = 0
385
- quartiles = " ".join(["%.3e" % x for x in quartiles])
386
- logging.info(
387
- f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
388
- f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
389
- )
390
-
391
- if step < clipping_update_period:
392
- return 1.0 # We have not yet estimated a norm to clip to.
393
- else:
394
- try:
395
- model_norm_threshold = first_state["model_norm_threshold"]
396
- except KeyError:
397
- logging.info(
398
- "Warning: model_norm_threshold not in state: possibly "
399
- "you changed config when restarting, adding clipping_scale option?"
400
- )
401
- return 1.0
402
- ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
403
- if ans < 1.0:
404
- first_state["num_clipped"] += 1
405
- if ans < 0.1:
406
- logging.warn(
407
- f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
408
- )
409
- if self.show_dominant_parameters:
410
- assert p.shape[0] == len(param_names)
411
- self._show_gradient_dominating_parameter(tuples, tot_sumsq)
412
- return ans
413
-
414
- def _show_gradient_dominating_parameter(
415
- self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor
416
- ):
417
- """
418
-        Show information about the parameter which dominates tot_sumsq.
419
-
420
- Args:
421
- tuples: a list of tuples of (param, state, param_names)
422
- where param is a batched set of parameters,
423
- with a .grad (1st dim is batch dim)
424
- and state is the state-dict where optimization parameters are kept.
425
- param_names is a List[str] while each str is name for a parameter
426
- in batched set of parameters "param".
427
-            tot_sumsq: sumsq of all parameters. Though it could be calculated
428
- from tuples, we still pass it to save some time.
429
- """
430
- all_sumsq_orig = {}
431
- for (p, state, batch_param_names) in tuples:
432
- # p is a stacked batch parameters.
433
- batch_grad = p.grad
434
- if p.numel() == p.shape[0]: # a batch of scalars
435
- batch_sumsq_orig = batch_grad ** 2
436
-                # Dummy values used by the following `zip` statement.
437
- batch_rms_orig = torch.ones(p.shape[0])
438
- else:
439
- batch_rms_orig = state["param_rms"]
440
- batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(
441
- dim=list(range(1, batch_grad.ndim))
442
- )
443
-
444
- for name, sumsq_orig, rms, grad in zip(
445
- batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad
446
- ):
447
-
448
- proportion_orig = sumsq_orig / tot_sumsq
449
- all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)
450
-
451
- assert torch.isclose(
452
- sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
453
- torch.tensor(1.0),
454
- )
455
- sorted_by_proportion = {
456
- k: v
457
- for k, v in sorted(
458
- all_sumsq_orig.items(),
459
- key=lambda item: item[1][0],
460
- reverse=True,
461
- )
462
- }
463
- dominant_param_name = next(iter(sorted_by_proportion))
464
- (
465
- dominant_proportion,
466
- dominant_sumsq,
467
- dominant_rms,
468
- dominant_grad,
469
- ) = sorted_by_proportion[dominant_param_name]
470
- logging.info(
471
-            f"Parameter dominating tot_sumsq {dominant_param_name}"
472
- f" with proportion {dominant_proportion:.2f},"
473
- f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
474
- f"={dominant_sumsq:.3e},"
475
- f" grad_sumsq = {(dominant_grad**2).sum():.3e},"
476
- f" orig_rms_sq={(dominant_rms**2).item():.3e}"
477
- )
478
-
479
- def _step_one_batch(
480
- self, group: dict, p: Tensor, state: dict, clipping_scale: float
481
- ):
482
- """
483
- Do the step for one parameter, which is actually going to be a batch of
484
- `real` parameters, with dim 0 as the batch dim.
485
- Args:
486
- group: dict to look up configuration values
487
- p: parameter to update (actually multiple parameters stacked together
488
- as a batch)
489
- state: state-dict for p, to look up the optimizer state
490
- """
491
- lr = group["lr"]
492
- size_update_period = group["size_update_period"]
493
- beta1 = group["betas"][0]
494
-
495
- grad = p.grad
496
- if clipping_scale != 1.0:
497
- grad = grad * clipping_scale
498
- step = state["step"]
499
- delta = state["delta"]
500
-
501
- delta.mul_(beta1)
502
- batch_size = p.shape[0]
503
- numel = p.numel() // batch_size
504
- if numel > 1:
505
- # Update the size/scale of p, and set param_rms
506
- scale_grads = state["scale_grads"]
507
- scale_grads[step % size_update_period] = (p * grad).sum(
508
- dim=list(range(1, p.ndim)), keepdim=True
509
- )
510
- if step % size_update_period == size_update_period - 1:
511
- param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..)
512
- param_rms.copy_(
513
- (p ** 2)
514
- .mean(dim=list(range(1, p.ndim)), keepdim=True)
515
- .sqrt()
516
- )
517
- if step > 0:
518
- # self._size_update() learns the overall scale on the
519
- # parameter, by shrinking or expanding it.
520
- self._size_update(group, scale_grads, p, state)
521
-
522
- if numel == 1:
523
- # For parameters with 1 element we just use regular Adam.
524
- # Updates delta.
525
- self._step_scalar(group, p, state)
526
- else:
527
- self._step(group, p, state)
528
-
529
- state["step"] = step + 1
530
-
531
- def _size_update(
532
- self, group: dict, scale_grads: Tensor, p: Tensor, state: dict
533
- ) -> None:
534
- """
535
- Called only where p.numel() > 1, this updates the scale of the parameter.
536
- If we imagine: p = underlying_param * scale.exp(), and we are doing
537
- gradient descent on underlying param and on scale, this function does the update
538
- on `scale`.
539
-
540
- Args:
541
- group: dict to look up configuration values
542
- scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
543
- grads w.r.t. the scales.
544
- p: The parameter to update
545
- state: The state-dict of p
546
- """
547
-
548
- param_rms = state["param_rms"]
549
- beta1, beta2 = group["betas"]
550
- size_lr = group["lr"] * group["scalar_lr_scale"]
551
- param_min_rms = group["param_min_rms"]
552
- param_max_rms = group["param_max_rms"]
553
- eps = group["eps"]
554
- step = state["step"]
555
- batch_size = p.shape[0]
556
-
557
- size_update_period = scale_grads.shape[0]
558
- # correct beta2 for the size update period: we will have
559
- # faster decay at this level.
560
- beta2_corr = beta2 ** size_update_period
561
-
562
- scale_exp_avg_sq = state[
563
- "scale_exp_avg_sq"
564
- ] # shape: (batch_size, 1, 1, ..)
565
- scale_exp_avg_sq.mul_(beta2_corr).add_(
566
- (scale_grads ** 2).mean(
567
- dim=0
568
- ), # mean over dim `size_update_period`
569
- alpha=1 - beta2_corr,
570
- ) # shape is (batch_size, 1, 1, ...)
571
-
572
- # The 1st time we reach here is when size_step == 1.
573
- size_step = (step + 1) // size_update_period
574
- bias_correction2 = 1 - beta2_corr ** size_step
575
- # we don't bother with bias_correction1; this will help prevent divergence
576
- # at the start of training.
577
-
578
- denom = scale_exp_avg_sq.sqrt() + eps
579
-
580
- scale_step = (
581
- -size_lr
582
- * (bias_correction2 ** 0.5)
583
- * scale_grads.sum(dim=0)
584
- / denom
585
- )
586
-
587
- is_too_small = param_rms < param_min_rms
588
- is_too_large = param_rms > param_max_rms
589
-
590
- # when the param gets too small, just don't shrink it any further.
591
- scale_step.masked_fill_(is_too_small, 0.0)
592
- # when it gets too large, stop it from getting any larger.
593
- scale_step.masked_fill_(is_too_large, -size_lr * size_update_period)
594
- delta = state["delta"]
595
- # the factor of (1-beta1) relates to momentum.
596
- delta.add_(p * scale_step, alpha=(1 - beta1))
597
-
598
- def _step(self, group: dict, p: Tensor, state: dict):
599
- """
600
- This function does the core update of self.step(), in the case where the members of
601
- the batch have more than 1 element.
602
-
603
- Args:
604
- group: A dict which will be used to look up configuration values
605
- p: The parameter to be updated
606
- grad: The grad of p
607
- state: The state-dict corresponding to parameter p
608
-
609
- This function modifies p.
610
- """
611
- grad = p.grad
612
- lr = group["lr"]
613
- beta1, beta2 = group["betas"]
614
- eps = group["eps"]
615
- param_min_rms = group["param_min_rms"]
616
- step = state["step"]
617
-
618
- exp_avg_sq = state["exp_avg_sq"]
619
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
620
-
621
- this_step = state["step"] - (
622
- state["zero_step"] if "zero_step" in state else 0
623
- )
624
- bias_correction2 = 1 - beta2 ** (this_step + 1)
625
- if bias_correction2 < 0.99:
626
- # note: not in-place.
627
- exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)
628
-
629
- denom = exp_avg_sq.sqrt()
630
- denom += eps
631
- grad = grad / denom
632
-
633
- alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)
634
-
635
- delta = state["delta"]
636
- delta.add_(grad * alpha)
637
- p.add_(delta)
638
-
639
- def _step_scalar(self, group: dict, p: Tensor, state: dict):
640
- """
641
- A simplified form of the core update for scalar tensors, where we cannot get a good
642
- estimate of the parameter rms.
643
- """
644
- beta1, beta2 = group["betas"]
645
- scalar_max = group["scalar_max"]
646
- eps = group["eps"]
647
- lr = group["lr"] * group["scalar_lr_scale"]
648
- grad = p.grad
649
-
650
- exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,)
651
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
652
-
653
- # bias_correction2 is like in Adam. Don't bother with bias_correction1;
654
- # slower update at the start will help stability anyway.
655
- bias_correction2 = 1 - beta2 ** (state["step"] + 1)
656
- denom = (exp_avg_sq / bias_correction2).sqrt() + eps
657
-
658
- delta = state["delta"]
659
- delta.add_(grad / denom, alpha=-lr * (1 - beta1))
660
- p.clamp_(min=-scalar_max, max=scalar_max)
661
- p.add_(delta)
662
-
663
-
664
- class LRScheduler(object):
665
- """
666
- Base-class for learning rate schedulers where the learning-rate depends on both the
667
- batch and the epoch.
668
- """
669
-
670
- def __init__(self, optimizer: Optimizer, verbose: bool = False):
671
- # Attach optimizer
672
- if not isinstance(optimizer, Optimizer):
673
- raise TypeError(
674
- "{} is not an Optimizer".format(type(optimizer).__name__)
675
- )
676
- self.optimizer = optimizer
677
- self.verbose = verbose
678
-
679
- for group in optimizer.param_groups:
680
- group.setdefault("base_lr", group["lr"])
681
-
682
- self.base_lrs = [group["base_lr"] for group in optimizer.param_groups]
683
-
684
- self.epoch = 0
685
- self.batch = 0
686
-
687
- def state_dict(self):
688
- """Returns the state of the scheduler as a :class:`dict`.
689
-
690
- It contains an entry for every variable in self.__dict__ which
691
- is not the optimizer.
692
- """
693
- return {
694
- "base_lrs": self.base_lrs,
695
- "epoch": self.epoch,
696
- "batch": self.batch,
697
- }
698
-
699
- def load_state_dict(self, state_dict):
700
- """Loads the schedulers state.
701
-
702
- Args:
703
- state_dict (dict): scheduler state. Should be an object returned
704
- from a call to :meth:`state_dict`.
705
- """
706
- self.__dict__.update(state_dict)
707
-
708
- def get_last_lr(self) -> List[float]:
709
- """Return last computed learning rate by current scheduler. Will be a list of float."""
710
- return self._last_lr
711
-
712
- def get_lr(self):
713
- # Compute list of learning rates from self.epoch and self.batch and
714
- # self.base_lrs; this must be overloaded by the user.
715
- # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
716
- raise NotImplementedError
717
-
718
- def step_batch(self, batch: Optional[int] = None) -> None:
719
- # Step the batch index, or just set it. If `batch` is specified, it
720
- # must be the batch index from the start of training, i.e. summed over
721
- # all epochs.
722
- # You can call this in any order; if you don't provide 'batch', it should
723
- # of course be called once per batch.
724
- if batch is not None:
725
- self.batch = batch
726
- else:
727
- self.batch = self.batch + 1
728
- self._set_lrs()
729
-
730
- def step_epoch(self, epoch: Optional[int] = None):
731
- # Step the epoch index, or just set it. If you provide the 'epoch' arg,
732
- # you should call this at the start of the epoch; if you don't provide the 'epoch'
733
- # arg, you should call it at the end of the epoch.
734
- if epoch is not None:
735
- self.epoch = epoch
736
- else:
737
- self.epoch = self.epoch + 1
738
- self._set_lrs()
739
-
740
- def _set_lrs(self):
741
- values = self.get_lr()
742
- assert len(values) == len(self.optimizer.param_groups)
743
-
744
- for i, data in enumerate(zip(self.optimizer.param_groups, values)):
745
- param_group, lr = data
746
- param_group["lr"] = lr
747
- self.print_lr(self.verbose, i, lr)
748
- self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
749
-
750
- def print_lr(self, is_verbose, group, lr):
751
- """Display the current learning rate."""
752
- if is_verbose:
753
- logging.info(
754
- f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
755
- f" of group {group} to {lr:.4e}."
756
- )
757
-
758
-
759
- class Eden(LRScheduler):
760
- """
761
- Eden scheduler.
762
- The basic formula (before warmup) is:
763
- lr = base_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
764
- (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25)) * warmup
765
-     where `warmup` increases linearly from 0.5 to 1 over `warmup_batches` batches
766
- and then stays constant at 1.
767
-
768
-
769
- E.g. suggest base_lr = 0.04 (passed to optimizer) if used with ScaledAdam
770
-
771
- Args:
772
- optimizer: the optimizer to change the learning rates on
773
- lr_batches: the number of batches after which we start significantly
774
- decreasing the learning rate, suggest 5000.
775
- lr_epochs: the number of epochs after which we start significantly
776
- decreasing the learning rate, suggest 6 if you plan to do e.g.
777
- 20 to 40 epochs, but may need smaller number if dataset is huge
778
- and you will do few epochs.
779
- """
780
-
781
- def __init__(
782
- self,
783
- optimizer: Optimizer,
784
- lr_batches: Union[int, float],
785
- lr_epochs: Union[int, float],
786
- warmup_batches: Union[int, float] = 500.0,
787
- verbose: bool = False,
788
- ):
789
- super(Eden, self).__init__(optimizer, verbose)
790
- self.lr_batches = lr_batches
791
- self.lr_epochs = lr_epochs
792
- self.warmup_batches = warmup_batches
793
-
794
- def get_lr(self):
795
- factor = (
796
- (self.batch ** 2 + self.lr_batches ** 2) / self.lr_batches ** 2
797
- ) ** -0.25 * (
798
- ((self.epoch ** 2 + self.lr_epochs ** 2) / self.lr_epochs ** 2)
799
- ** -0.25
800
- )
801
- warmup_factor = (
802
- 1.0
803
- if self.batch >= self.warmup_batches
804
- else 0.5 + 0.5 * (self.batch / self.warmup_batches)
805
- )
806
-
807
- return [x * factor * warmup_factor for x in self.base_lrs]
808
-
809
-
810
- def _test_eden():
811
- m = torch.nn.Linear(100, 100)
812
- optim = ScaledAdam(m.parameters(), lr=0.03)
813
-
814
- scheduler = Eden(optim, lr_batches=100, lr_epochs=2, verbose=True)
815
-
816
- for epoch in range(10):
817
- scheduler.step_epoch(epoch) # sets epoch to `epoch`
818
-
819
- for step in range(20):
820
- x = torch.randn(200, 100).detach()
821
- x.requires_grad = True
822
- y = m(x)
823
- dy = torch.randn(200, 100).detach()
824
- f = (y * dy).sum()
825
- f.backward()
826
-
827
- optim.step()
828
- scheduler.step_batch()
829
- optim.zero_grad()
830
-
831
- logging.info(f"last lr = {scheduler.get_last_lr()}")
832
- logging.info(f"state dict = {scheduler.state_dict()}")
833
-
834
-
835
- # This is included mostly as a baseline for ScaledAdam.
836
- class Eve(Optimizer):
837
- """
838
- Implements Eve algorithm. This is a modified version of AdamW with a special
839
- way of setting the weight-decay / shrinkage-factor, which is designed to make the
840
- rms of the parameters approach a particular target_rms (default: 0.1). This is
841
- for use with networks with 'scaled' versions of modules (see scaling.py), which
842
- will be close to invariant to the absolute scale on the parameter matrix.
843
-
844
- The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
845
- The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
846
- Eve is unpublished so far.
847
-
848
- Arguments:
849
- params (iterable): iterable of parameters to optimize or dicts defining
850
- parameter groups
851
- lr (float, optional): learning rate (default: 1e-3)
852
- betas (Tuple[float, float], optional): coefficients used for computing
853
- running averages of gradient and its square (default: (0.9, 0.999))
854
- eps (float, optional): term added to the denominator to improve
855
- numerical stability (default: 1e-8)
856
- weight_decay (float, optional): weight decay coefficient (default: 3e-4;
857
- this value means that the weight would decay significantly after
858
- about 3k minibatches. Is not multiplied by learning rate, but
859
- is conditional on RMS-value of parameter being > target_rms.
860
- target_rms (float, optional): target root-mean-square value of
861
- parameters, if they fall below this we will stop applying weight decay.
862
-
863
-
864
- .. _Adam: A Method for Stochastic Optimization:
865
- https://arxiv.org/abs/1412.6980
866
- .. _Decoupled Weight Decay Regularization:
867
- https://arxiv.org/abs/1711.05101
868
- .. _On the Convergence of Adam and Beyond:
869
- https://openreview.net/forum?id=ryQu7f-RZ
870
- """
871
-
872
- def __init__(
873
- self,
874
- params,
875
- lr=1e-3,
876
- betas=(0.9, 0.98),
877
- eps=1e-8,
878
- weight_decay=1e-3,
879
- target_rms=0.1,
880
- ):
881
- if not 0.0 <= lr:
882
- raise ValueError("Invalid learning rate: {}".format(lr))
883
- if not 0.0 <= eps:
884
- raise ValueError("Invalid epsilon value: {}".format(eps))
885
- if not 0.0 <= betas[0] < 1.0:
886
- raise ValueError(
887
- "Invalid beta parameter at index 0: {}".format(betas[0])
888
- )
889
- if not 0.0 <= betas[1] < 1.0:
890
- raise ValueError(
891
- "Invalid beta parameter at index 1: {}".format(betas[1])
892
- )
893
- if not 0 <= weight_decay <= 0.1:
894
- raise ValueError(
895
- "Invalid weight_decay value: {}".format(weight_decay)
896
- )
897
- if not 0 < target_rms <= 10.0:
898
- raise ValueError("Invalid target_rms value: {}".format(target_rms))
899
- defaults = dict(
900
- lr=lr,
901
- betas=betas,
902
- eps=eps,
903
- weight_decay=weight_decay,
904
- target_rms=target_rms,
905
- )
906
- super(Eve, self).__init__(params, defaults)
907
-
908
- def __setstate__(self, state):
909
- super(Eve, self).__setstate__(state)
910
-
911
- @torch.no_grad()
912
- def step(self, closure=None):
913
- """Performs a single optimization step.
914
-
915
- Arguments:
916
- closure (callable, optional): A closure that reevaluates the model
917
- and returns the loss.
918
- """
919
- loss = None
920
- if closure is not None:
921
- with torch.enable_grad():
922
- loss = closure()
923
-
924
- for group in self.param_groups:
925
- for p in group["params"]:
926
- if p.grad is None:
927
- continue
928
-
929
- # Perform optimization step
930
- grad = p.grad
931
- if grad.is_sparse:
932
- raise RuntimeError(
933
- "AdamW does not support sparse gradients"
934
- )
935
-
936
- state = self.state[p]
937
-
938
- # State initialization
939
- if len(state) == 0:
940
- state["step"] = 0
941
- # Exponential moving average of gradient values
942
- state["exp_avg"] = torch.zeros_like(
943
- p, memory_format=torch.preserve_format
944
- )
945
- # Exponential moving average of squared gradient values
946
- state["exp_avg_sq"] = torch.zeros_like(
947
- p, memory_format=torch.preserve_format
948
- )
949
-
950
- exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
951
-
952
- beta1, beta2 = group["betas"]
953
-
954
- state["step"] += 1
955
- bias_correction1 = 1 - beta1 ** state["step"]
956
- bias_correction2 = 1 - beta2 ** state["step"]
957
-
958
- # Decay the first and second moment running average coefficient
959
- exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
960
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
961
- denom = (exp_avg_sq.sqrt() * (bias_correction2 ** -0.5)).add_(
962
- group["eps"]
963
- )
964
-
965
- step_size = group["lr"] / bias_correction1
966
- target_rms = group["target_rms"]
967
- weight_decay = group["weight_decay"]
968
-
969
- if p.numel() > 1:
970
- # avoid applying this weight-decay on "scaling factors"
971
- # (which are scalar).
972
- is_above_target_rms = p.norm() > (
973
- target_rms * (p.numel() ** 0.5)
974
- )
975
- p.mul_(1 - (weight_decay * is_above_target_rms))
976
-
977
- p.addcdiv_(exp_avg, denom, value=-step_size)
978
-
979
- # if random.random() < 0.0005:
980
- # step = (exp_avg / denom) * step_size
981
- # logging.info(
982
- # f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}"
983
- # )
984
-
985
- return loss
986
-
987
-
988
- def _test_scaled_adam(hidden_dim: int):
989
- import timeit
990
-
991
- from scaling import ScaledLinear
992
-
993
- E = 100
994
- B = 4
995
- T = 2
996
- logging.info("in test_eve_cain")
997
- # device = torch.device('cuda')
998
- device = torch.device("cpu")
999
- dtype = torch.float32
1000
-
1001
- fix_random_seed(42)
1002
- # these input_magnitudes and output_magnitudes are to test that
1003
- # Abel is working as we expect and is able to adjust scales of
1004
- # different dims differently.
1005
- input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
1006
- output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
1007
-
1008
- for iter in [1, 0]:
1009
- fix_random_seed(42)
1010
- Linear = torch.nn.Linear if iter == 0 else ScaledLinear
1011
-
1012
- m = torch.nn.Sequential(
1013
- Linear(E, hidden_dim),
1014
- torch.nn.PReLU(),
1015
- Linear(hidden_dim, hidden_dim),
1016
- torch.nn.PReLU(),
1017
- Linear(hidden_dim, E),
1018
- ).to(device)
1019
-
1020
- train_pairs = [
1021
- (
1022
- 100.0
1023
- * torch.randn(B, T, E, device=device, dtype=dtype)
1024
- * input_magnitudes,
1025
- torch.randn(B, T, E, device=device, dtype=dtype)
1026
- * output_magnitudes,
1027
- )
1028
- for _ in range(20)
1029
- ]
1030
-
1031
- if iter == 0:
1032
- optim = Eve(m.parameters(), lr=0.003)
1033
- elif iter == 1:
1034
- optim = ScaledAdam(m.parameters(), lr=0.03, clipping_scale=2.0)
1035
- scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False)
1036
-
1037
- start = timeit.default_timer()
1038
- avg_loss = 0.0
1039
- for epoch in range(180):
1040
- scheduler.step_epoch()
1041
- # if epoch == 100 and iter in [2,3]:
1042
- # optim.reset_speedup() # check it doesn't crash.
1043
-
1044
- # if epoch == 130:
1045
- # opts = diagnostics.TensorDiagnosticOptions(
1046
- # 2 ** 22
1047
- # ) # allow 4 megabytes per sub-module
1048
- # diagnostic = diagnostics.attach_diagnostics(m, opts)
1049
-
1050
- for n, (x, y) in enumerate(train_pairs):
1051
- y_out = m(x)
1052
- loss = ((y_out - y) ** 2).mean() * 100.0
1053
- if epoch == 0 and n == 0:
1054
- avg_loss = loss.item()
1055
- else:
1056
- avg_loss = 0.98 * avg_loss + 0.02 * loss.item()
1057
- if n == 0 and epoch % 5 == 0:
1058
- # norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item()
1059
- # norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item()
1060
- # norm2 = '%.2e' % (m[2].weight**2).mean().sqrt().item()
1061
- # norm2b = '%.2e' % (m[2].bias**2).mean().sqrt().item()
1062
- # scale1 = '%.2e' % (m[0].weight_scale.exp().item())
1063
- # scale1b = '%.2e' % (m[0].bias_scale.exp().item())
1064
- # scale2 = '%.2e' % (m[2].weight_scale.exp().item())
1065
- # scale2b = '%.2e' % (m[2].bias_scale.exp().item())
1066
- lr = scheduler.get_last_lr()[0]
1067
- logging.info(
1068
- f"Iter {iter}, epoch {epoch}, batch {n}, avg_loss {avg_loss:.4g}, lr={lr:.4e}"
1069
- ) # , norms={norm1,norm1b,norm2,norm2b}") # scales={scale1,scale1b,scale2,scale2b}
1070
- loss.log().backward()
1071
- optim.step()
1072
- optim.zero_grad()
1073
- scheduler.step_batch()
1074
-
1075
- # diagnostic.print_diagnostics()
1076
-
1077
- stop = timeit.default_timer()
1078
- logging.info(f"Iter={iter}, Time taken: {stop - start}")
1079
-
1080
- logging.info(f"last lr = {scheduler.get_last_lr()}")
1081
- # logging.info("state dict = ", scheduler.state_dict())
1082
- # logging.info("optim state_dict = ", optim.state_dict())
1083
- logging.info(f"input_magnitudes = {input_magnitudes}")
1084
- logging.info(f"output_magnitudes = {output_magnitudes}")
1085
-
1086
-
1087
- if __name__ == "__main__":
1088
- torch.set_num_threads(1)
1089
- torch.set_num_interop_threads(1)
1090
- logging.getLogger().setLevel(logging.INFO)
1091
- import subprocess
1092
-
1093
- s = subprocess.check_output(
1094
- "git status -uno .; git log -1; git diff HEAD .", shell=True
1095
- )
1096
- logging.info(s)
1097
- import sys
1098
-
1099
- if len(sys.argv) > 1:
1100
- hidden_dim = int(sys.argv[1])
1101
- else:
1102
- hidden_dim = 200
1103
-
1104
- _test_scaled_adam(hidden_dim)
1105
- _test_eden()
 
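The Eden docstring above states the schedule in prose; the sketch below simply re-evaluates that factor numerically to show the shape of the schedule. It mirrors Eden.get_lr() from the deleted file, and the default arguments are the values suggested in the docstring (lr_batches=5000, lr_epochs=6, warmup_batches=500), not values read from any config in this repository.

def eden_factor(batch, epoch, lr_batches=5000, lr_epochs=6, warmup_batches=500):
    # Mirrors Eden.get_lr() above: power-law decay in both the batch and the epoch
    # index, times a warmup that ramps linearly from 0.5 to 1.0 over warmup_batches.
    decay = (((batch ** 2 + lr_batches ** 2) / lr_batches ** 2) ** -0.25
             * ((epoch ** 2 + lr_epochs ** 2) / lr_epochs ** 2) ** -0.25)
    warmup = 1.0 if batch >= warmup_batches else 0.5 + 0.5 * batch / warmup_batches
    return decay * warmup

# With the ScaledAdam default lr of 0.03 defined above:
#   0.03 * eden_factor(0, 0)       ~= 0.0150  (start of warmup)
#   0.03 * eden_factor(5000, 2)    ~= 0.0246  (decay starting to bite)
#   0.03 * eden_factor(50000, 20)  ~= 0.0051  (late in training)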
modules/scaling.py DELETED
@@ -1,1401 +0,0 @@
1
- # Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
2
- #
3
- # See ../../../../LICENSE for clarification regarding multiple authors
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
-
18
- import collections
19
- import logging
20
- import random
21
- import math
22
- from functools import reduce
23
- from itertools import repeat
24
- from typing import Optional, Tuple, Union
25
-
26
- import torch
27
- import torch.nn as nn
28
- import torch.nn.functional as F
29
- from torch import Tensor
30
- from torch.nn import Embedding as ScaledEmbedding
31
-
32
- from utils import Transpose
33
-
34
-
35
- class ActivationBalancerFunction(torch.autograd.Function):
36
- @staticmethod
37
- def forward(
38
- ctx,
39
- x: Tensor,
40
- scale_factor: Tensor,
41
- sign_factor: Optional[Tensor],
42
- channel_dim: int,
43
- ) -> Tensor:
44
- if channel_dim < 0:
45
- channel_dim += x.ndim
46
- ctx.channel_dim = channel_dim
47
- xgt0 = x > 0
48
- if sign_factor is None:
49
- ctx.save_for_backward(xgt0, scale_factor)
50
- else:
51
- ctx.save_for_backward(xgt0, scale_factor, sign_factor)
52
- return x
53
-
54
- @staticmethod
55
- def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]:
56
- if len(ctx.saved_tensors) == 3:
57
- xgt0, scale_factor, sign_factor = ctx.saved_tensors
58
- for _ in range(ctx.channel_dim, x_grad.ndim - 1):
59
- scale_factor = scale_factor.unsqueeze(-1)
60
- sign_factor = sign_factor.unsqueeze(-1)
61
- factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5)
62
- else:
63
- xgt0, scale_factor = ctx.saved_tensors
64
- for _ in range(ctx.channel_dim, x_grad.ndim - 1):
65
- scale_factor = scale_factor.unsqueeze(-1)
66
- factor = scale_factor * (xgt0.to(x_grad.dtype) - 0.5)
67
- neg_delta_grad = x_grad.abs() * factor
68
- return (
69
- x_grad - neg_delta_grad,
70
- None,
71
- None,
72
- None,
73
- )
74
-
75
-
76
- def _compute_scale_factor(
77
- x: Tensor,
78
- channel_dim: int,
79
- min_abs: float,
80
- max_abs: float,
81
- gain_factor: float,
82
- max_factor: float,
83
- ) -> Tensor:
84
- if channel_dim < 0:
85
- channel_dim += x.ndim
86
- sum_dims = [d for d in range(x.ndim) if d != channel_dim]
87
- x_abs_mean = torch.mean(x.abs(), dim=sum_dims).to(torch.float32)
88
-
89
- if min_abs == 0.0:
90
- below_threshold = 0.0
91
- else:
92
- # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if
93
- # x_abs_mean < min_abs.
94
- below_threshold = (
95
- (min_abs - x_abs_mean) * (gain_factor / min_abs)
96
- ).clamp(min=0, max=max_factor)
97
-
98
- above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(
99
- min=0, max=max_factor
100
- )
101
-
102
- return below_threshold - above_threshold
103
-
104
-
105
- def _compute_sign_factor(
106
- x: Tensor,
107
- channel_dim: int,
108
- min_positive: float,
109
- max_positive: float,
110
- gain_factor: float,
111
- max_factor: float,
112
- ) -> Tensor:
113
- if channel_dim < 0:
114
- channel_dim += x.ndim
115
- sum_dims = [d for d in range(x.ndim) if d != channel_dim]
116
- proportion_positive = torch.mean((x > 0).to(torch.float32), dim=sum_dims)
117
- if min_positive == 0.0:
118
- factor1 = 0.0
119
- else:
120
- # 0 if proportion_positive >= min_positive, else can be
121
- # as large as max_factor.
122
- factor1 = (
123
- (min_positive - proportion_positive) * (gain_factor / min_positive)
124
- ).clamp_(min=0, max=max_factor)
125
-
126
- if max_positive == 1.0:
127
- factor2 = 0.0
128
- else:
129
- # 0 if self.proportion_positive <= max_positive, else can be
130
- # as large as -max_factor.
131
- factor2 = (
132
- (proportion_positive - max_positive)
133
- * (gain_factor / (1.0 - max_positive))
134
- ).clamp_(min=0, max=max_factor)
135
- sign_factor = factor1 - factor2
136
- # require min_positive != 0 or max_positive != 1:
137
- assert not isinstance(sign_factor, float)
138
- return sign_factor
139
-
140
-
141
- class ActivationScaleBalancerFunction(torch.autograd.Function):
142
- """
143
- This object is used in class ActivationBalancer when the user specified
144
- min_positive=0, max_positive=1, so there are no constraints on the signs
145
- of the activations and only the absolute value has a constraint.
146
- """
147
-
148
- @staticmethod
149
- def forward(
150
- ctx,
151
- x: Tensor,
152
- sign_factor: Tensor,
153
- scale_factor: Tensor,
154
- channel_dim: int,
155
- ) -> Tensor:
156
- if channel_dim < 0:
157
- channel_dim += x.ndim
158
- ctx.channel_dim = channel_dim
159
- xgt0 = x > 0
160
- ctx.save_for_backward(xgt0, sign_factor, scale_factor)
161
- return x
162
-
163
- @staticmethod
164
- def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]:
165
- xgt0, sign_factor, scale_factor = ctx.saved_tensors
166
- for _ in range(ctx.channel_dim, x_grad.ndim - 1):
167
- sign_factor = sign_factor.unsqueeze(-1)
168
- scale_factor = scale_factor.unsqueeze(-1)
169
-
170
- factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5)
171
- neg_delta_grad = x_grad.abs() * factor
172
- return (
173
- x_grad - neg_delta_grad,
174
- None,
175
- None,
176
- None,
177
- )
178
-
179
-
180
- class RandomClampFunction(torch.autograd.Function):
181
- @staticmethod
182
- def forward(
183
- ctx,
184
- x: Tensor,
185
- min: Optional[float],
186
- max: Optional[float],
187
- prob: float,
188
- reflect: float,
189
- ) -> Tensor:
190
- x_clamped = torch.clamp(x, min=min, max=max)
191
- mask = torch.rand_like(x) < prob
192
- ans = torch.where(mask, x_clamped, x)
193
- if x.requires_grad:
194
- ctx.save_for_backward(ans == x)
195
- ctx.reflect = reflect
196
- if reflect != 0.0:
197
- ans = ans * (1.0 + reflect) - (x * reflect)
198
- return ans
199
-
200
- @staticmethod
201
- def backward(
202
- ctx, ans_grad: Tensor
203
- ) -> Tuple[Tensor, None, None, None, None]:
204
- (is_same,) = ctx.saved_tensors
205
- x_grad = ans_grad * is_same.to(ans_grad.dtype)
206
- reflect = ctx.reflect
207
- if reflect != 0.0:
208
- x_grad = x_grad * (1.0 + reflect) - (ans_grad * reflect)
209
- return x_grad, None, None, None, None
210
-
211
-
212
- def random_clamp(
213
- x: Tensor,
214
- min: Optional[float] = None,
215
- max: Optional[float] = None,
216
- prob: float = 0.5,
217
- reflect: float = 0.0,
218
- ):
219
- return RandomClampFunction.apply(x, min, max, prob, reflect)
220
-
221
-
222
- def random_cast_to_half(x: Tensor, min_abs: float = 5.0e-06) -> Tensor:
223
- """
224
- A randomized way of casting a floating point value to half precision.
225
- """
226
- if x.dtype == torch.float16:
227
- return x
228
- x_abs = x.abs()
229
- is_too_small = x_abs < min_abs
230
- # for elements where is_too_small is true, random_val will contain +-min_abs with
231
- # probability (x.abs() / min_abs), and 0.0 otherwise. [so this preserves expectations,
232
- # for those elements].
233
- random_val = min_abs * x.sign() * (torch.rand_like(x) * min_abs < x_abs)
234
- return torch.where(is_too_small, random_val, x).to(torch.float16)
235
-
236
-
237
- class RandomGradFunction(torch.autograd.Function):
238
- """
239
- Does nothing in forward pass; in backward pass, gets rid of very small grads using
240
- randomized approach that preserves expectations (intended to reduce roundoff).
241
- """
242
-
243
- @staticmethod
244
- def forward(ctx, x: Tensor, min_abs: float) -> Tensor:
245
- ctx.min_abs = min_abs
246
- return x
247
-
248
- @staticmethod
249
- def backward(ctx, ans_grad: Tensor) -> Tuple[Tensor, None]:
250
- if ans_grad.dtype == torch.float16:
251
- return (
252
- random_cast_to_half(
253
- ans_grad.to(torch.float32), min_abs=ctx.min_abs
254
- ),
255
- None,
256
- )
257
- else:
258
- return ans_grad, None
259
-
260
-
261
- class RandomGrad(torch.nn.Module):
262
- """
263
- Gets rid of very small gradients using an expectation-preserving method, intended to increase
264
- accuracy of training when using amp (automatic mixed precision)
265
- """
266
-
267
- def __init__(self, min_abs: float = 5.0e-06):
268
- super(RandomGrad, self).__init__()
269
- self.min_abs = min_abs
270
-
271
- def forward(self, x: Tensor):
272
- if (
273
- torch.jit.is_scripting()
274
- or not self.training
275
- or torch.jit.is_tracing()
276
- ):
277
- return x
278
- else:
279
- return RandomGradFunction.apply(x, self.min_abs)
280
-
281
-
282
- class SoftmaxFunction(torch.autograd.Function):
283
- """
284
- Tries to handle half-precision derivatives in a randomized way that should
285
- be more accurate for training than the default behavior.
286
- """
287
-
288
- @staticmethod
289
- def forward(ctx, x: Tensor, dim: int):
290
- ans = x.softmax(dim=dim)
291
- # if x dtype is float16, x.softmax() returns a float32 because
292
- # (presumably) that op does not support float16, and autocast
293
- # is enabled.
294
- if torch.is_autocast_enabled():
295
- ans = ans.to(torch.float16)
296
- ctx.save_for_backward(ans)
297
- ctx.x_dtype = x.dtype
298
- ctx.dim = dim
299
- return ans
300
-
301
- @staticmethod
302
- def backward(ctx, ans_grad: Tensor):
303
- (ans,) = ctx.saved_tensors
304
- with torch.cuda.amp.autocast(enabled=False):
305
- ans_grad = ans_grad.to(torch.float32)
306
- ans = ans.to(torch.float32)
307
- x_grad = ans_grad * ans
308
- x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True)
309
- return x_grad, None
310
-
311
-
312
- def softmax(x: Tensor, dim: int):
313
- if torch.jit.is_scripting() or torch.jit.is_tracing():
314
- return x.softmax(dim)
315
-
316
- return SoftmaxFunction.apply(x, dim)
317
-
318
-
319
- class MaxEigLimiterFunction(torch.autograd.Function):
320
- @staticmethod
321
- def forward(
322
- ctx,
323
- x: Tensor,
324
- coeffs: Tensor,
325
- direction: Tensor,
326
- channel_dim: int,
327
- grad_scale: float,
328
- ) -> Tensor:
329
- ctx.channel_dim = channel_dim
330
- ctx.grad_scale = grad_scale
331
- ctx.save_for_backward(x.detach(), coeffs.detach(), direction.detach())
332
- return x
333
-
334
- @staticmethod
335
- def backward(ctx, x_grad, *args):
336
- with torch.enable_grad():
337
- (x_orig, coeffs, new_direction) = ctx.saved_tensors
338
- x_orig.requires_grad = True
339
- num_channels = x_orig.shape[ctx.channel_dim]
340
- x = x_orig.transpose(ctx.channel_dim, -1).reshape(-1, num_channels)
341
- new_direction.requires_grad = False
342
- x = x - x.mean(dim=0)
343
- x_var = (x ** 2).mean()
344
- x_residual = x - coeffs * new_direction
345
- x_residual_var = (x_residual ** 2).mean()
346
- # `variance_proportion` is the proportion of the variance accounted for
347
- # by the top eigen-direction. This is to be minimized.
348
- variance_proportion = (x_var - x_residual_var) / (x_var + 1.0e-20)
349
- variance_proportion.backward()
350
- x_orig_grad = x_orig.grad
351
- x_extra_grad = (
352
- x_orig.grad
353
- * ctx.grad_scale
354
- * x_grad.norm()
355
- / (x_orig_grad.norm() + 1.0e-20)
356
- )
357
- return x_grad + x_extra_grad.detach(), None, None, None, None
358
-
359
-
360
- class BasicNorm(torch.nn.Module):
361
- """
362
- This is intended to be a simpler, and hopefully cheaper, replacement for
363
- LayerNorm. The observation this is based on, is that Transformer-type
364
- networks, especially with pre-norm, sometimes seem to set one of the
365
- feature dimensions to a large constant value (e.g. 50), which "defeats"
366
- the LayerNorm because the output magnitude is then not strongly dependent
367
- on the other (useful) features. Presumably the weight and bias of the
368
- LayerNorm are required to allow it to do this.
369
-
370
- So the idea is to introduce this large constant value as an explicit
371
- parameter, that takes the role of the "eps" in LayerNorm, so the network
372
- doesn't have to do this trick. We make the "eps" learnable.
373
-
374
- Args:
375
- num_channels: the number of channels, e.g. 512.
376
- channel_dim: the axis/dimension corresponding to the channel,
377
- interprted as an offset from the input's ndim if negative.
378
- this is NOT the num_channels; it should typically be one of
379
- {-2, -1, 0, 1, 2, 3}.
380
- eps: the initial "epsilon" that we add as ballast in:
381
- scale = ((input_vec**2).mean() + epsilon)**-0.5
382
- Note: our epsilon is actually large, but we keep the name
383
- to indicate the connection with conventional LayerNorm.
384
- learn_eps: if true, we learn epsilon; if false, we keep it
385
- at the initial value.
386
- eps_min: float
387
- eps_max: float
388
- """
389
-
390
- def __init__(
391
- self,
392
- num_channels: int,
393
- channel_dim: int = -1, # CAUTION: see documentation.
394
- eps: float = 0.25,
395
- learn_eps: bool = True,
396
- eps_min: float = -3.0,
397
- eps_max: float = 3.0,
398
- ) -> None:
399
- super(BasicNorm, self).__init__()
400
- self.num_channels = num_channels
401
- self.channel_dim = channel_dim
402
- if learn_eps:
403
- self.eps = nn.Parameter(torch.tensor(eps).log().detach())
404
- else:
405
- self.register_buffer("eps", torch.tensor(eps).log().detach())
406
- self.eps_min = eps_min
407
- self.eps_max = eps_max
408
-
409
- def forward(self, x: Tensor) -> Tensor:
410
- assert x.shape[self.channel_dim] == self.num_channels
411
- eps = self.eps
412
- if self.training and random.random() < 0.25:
413
- # with probability 0.25, in training mode, clamp eps between the min
414
- # and max; this will encourage it to learn parameters within the
415
- # allowed range by making parameters that are outside the allowed
416
- # range noisy.
417
-
418
- # gradients to allow the parameter to get back into the allowed
419
- # region if it happens to exit it.
420
- eps = eps.clamp(min=self.eps_min, max=self.eps_max)
421
- scales = (
422
- torch.mean(x ** 2, dim=self.channel_dim, keepdim=True) + eps.exp()
423
- ) ** -0.5
424
- return x * scales
425
-
426
-
427
- def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear:
428
- """
429
- Behaves like a constructor of a modified version of nn.Linear
430
- that gives an easy way to set the default initial parameter scale.
431
-
432
- Args:
433
- Accepts the standard args and kwargs that nn.Linear accepts
434
- e.g. in_features, out_features, bias=False.
435
-
436
- initial_scale: you can override this if you want to increase
437
- or decrease the initial magnitude of the module's output
438
- (affects the initialization of weight_scale and bias_scale).
439
- Another option, if you want to do something like this, is
440
- to re-initialize the parameters.
441
- """
442
- ans = nn.Linear(*args, **kwargs)
443
- with torch.no_grad():
444
- ans.weight[:] *= initial_scale
445
- if ans.bias is not None:
446
- torch.nn.init.uniform_(
447
- ans.bias, -0.1 * initial_scale, 0.1 * initial_scale
448
- )
449
- return ans
450
-
451
-
452
- def ScaledConv1d(
453
- *args,
454
- initial_scale: float = 1.0,
455
- kernel_size: int = 3,
456
- padding: str = "same",
457
- **kwargs,
458
- ) -> nn.Conv1d:
459
- """
460
- Behaves like a constructor of a modified version of nn.Conv1d
461
- that gives an easy way to set the default initial parameter scale.
462
-
463
- Args:
464
- Accepts the standard args and kwargs that nn.Conv1d accepts
465
- e.g. in_channels, out_channels, bias=False.
466
-
467
- initial_scale: you can override this if you want to increase
468
- or decrease the initial magnitude of the module's output
469
- (affects the initialization of weight_scale and bias_scale).
470
- Another option, if you want to do something like this, is
471
- to re-initialize the parameters.
472
- """
473
- ans = nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs)
474
- with torch.no_grad():
475
- ans.weight[:] *= initial_scale
476
- if ans.bias is not None:
477
- torch.nn.init.uniform_(
478
- ans.bias, -0.1 * initial_scale, 0.1 * initial_scale
479
- )
480
- return ans
481
-
482
-
483
- def TransposeScaledConv1d(
484
- *args,
485
- initial_scale: float = 1.0,
486
- kernel_size: int = 3,
487
- padding: str = "same",
488
- **kwargs,
489
- ) -> nn.Sequential:
490
- """
491
- Transpose -> ScaledConv1d
492
- """
493
- return nn.Sequential(
494
- Transpose(),
495
- ScaledConv1d(
496
- *args,
497
- initial_scale=initial_scale,
498
- kernel_size=kernel_size,
499
- padding=padding,
500
- **kwargs,
501
- ),
502
- )
503
-
504
-
505
- def ScaledConv1dTranspose(
506
- *args,
507
- initial_scale: float = 1.0,
508
- kernel_size: int = 3,
509
- padding: str = "same",
510
- **kwargs,
511
- ) -> nn.Sequential:
512
- """
513
- ScaledConv1d -> Transpose
514
- """
515
- return nn.Sequential(
516
- ScaledConv1d(
517
- *args,
518
- initial_scale=initial_scale,
519
- kernel_size=kernel_size,
520
- padding=padding,
521
- **kwargs,
522
- ),
523
- Transpose(),
524
- )
525
-
526
-
527
- def TransposeConv1d(
528
- *args, kernel_size: int = 3, padding: str = "same", **kwargs
529
- ) -> nn.Sequential:
530
- """
531
- Transpose -> Conv1d
532
- """
533
- return nn.Sequential(
534
- Transpose(),
535
- nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs),
536
- )
537
-
538
-
539
- def Conv1dTranspose(
540
- *args, kernel_size: int = 3, padding: str = "same", **kwargs
541
- ) -> nn.Sequential:
542
- """
543
- Conv1d -> Transpose
544
- """
545
- return nn.Sequential(
546
- nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs),
547
- Transpose(),
548
- )
549
-
550
-
551
- class SRLinear(nn.Linear):
552
- """https://arxiv.org/abs/2303.06296
553
- Stabilizing Transformer Training by Preventing Attention Entropy Collapse
554
- """
555
-
556
- def __init__(self, in_features, out_features, bias=True, **kwargs):
557
- super().__init__(in_features, out_features, bias=bias, **kwargs)
558
- self.register_buffer(
559
- "u", nn.functional.normalize(torch.randn(in_features), dim=0)
560
- )
561
- with torch.no_grad():
562
- sigma = self.get_sigma()
563
- self.register_buffer("spectral_norm", sigma)
564
- self.sigma = nn.Parameter(torch.ones(1))
565
-
566
- def get_sigma(self):
567
- with torch.no_grad():
568
- u = self.u
569
- v = self.weight.mv(u)
570
- v = nn.functional.normalize(v, dim=0)
571
- u = self.weight.T.mv(v)
572
- u = nn.functional.normalize(u, dim=0)
573
- self.u.data.copy_(u)
574
- return torch.einsum("c,cd,d->", v, self.weight, u)
575
-
576
- def get_weight(self):
577
- sigma = self.get_sigma()
578
- if self.training:
579
- self.spectral_norm.data.copy_(sigma)
580
- weight = (self.sigma / sigma) * self.weight
581
- return weight
582
-
583
- def forward(self, x):
584
- return nn.functional.linear(x, self.get_weight(), self.bias)
585
-
586
-
587
- class SRConv1d(SRLinear):
588
- def __init__(
589
- self,
590
- in_features,
591
- out_features,
592
- kernel_size,
593
- stride: int = 1,
594
- padding: str = "same",
595
- bias: bool = True,
596
- **kwargs,
597
- ):
598
- in_features = in_features * kernel_size
599
- super().__init__(in_features, out_features, bias=bias, **kwargs)
600
- nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
601
- self.kernel_size = kernel_size
602
- self.stride = stride
603
- self.padding = padding
604
-
605
- def forward(self, x):
606
- in_features = self.in_features // self.kernel_size
607
- weight = self.get_weight().view(
608
- self.out_features, in_features, self.kernel_size
609
- )
610
- return nn.functional.conv1d(
611
- x, weight, bias=self.bias, stride=self.stride, padding=self.padding
612
- )
613
-
614
-
615
- def TransposeSRConv1d(
616
- *args, kernel_size: int = 3, padding: str = "same", **kwargs
617
- ) -> nn.Sequential:
618
- """
619
- Transpose -> SRConv1d
620
- """
621
- return nn.Sequential(
622
- Transpose(),
623
- SRConv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs),
624
- )
625
-
626
-
627
- def SRConv1dTranspose(
628
- *args, kernel_size: int = 3, padding: str = "same", **kwargs
629
- ) -> nn.Sequential:
630
- """
631
- SRConv1d -> Transpose
632
- """
633
- return nn.Sequential(
634
- SRConv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs),
635
- Transpose(),
636
- )
637
-
638
-
639
- class ActivationBalancer(torch.nn.Module):
640
- """
641
- Modifies the backpropped derivatives of a function to try to encourage, for
642
- each channel, that it is positive at least a proportion `threshold` of the
643
- time. It does this by multiplying negative derivative values by up to
644
- (1+max_factor), and positive derivative values by up to (1-max_factor),
645
- interpolated from 1 at the threshold to those extremal values when none
646
- of the inputs are positive.
647
-
648
- Args:
649
- num_channels: the number of channels
650
- channel_dim: the dimension/axis corresponding to the channel, e.g.
651
- -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
652
- min_positive: the minimum, per channel, of the proportion of the time
653
- that (x > 0), below which we start to modify the derivatives.
654
- max_positive: the maximum, per channel, of the proportion of the time
655
- that (x > 0), above which we start to modify the derivatives.
656
- max_factor: the maximum factor by which we modify the derivatives for
657
- either the sign constraint or the magnitude constraint;
658
- e.g. with max_factor=0.02, the derivatives would be multiplied by
659
- values in the range [0.98..1.02].
660
- sign_gain_factor: determines the 'gain' with which we increase the
661
- change in gradient once the constraints on min_positive and max_positive
662
- are violated.
663
- scale_gain_factor: determines the 'gain' with which we increase the
664
- change in gradient once the constraints on min_abs and max_abs
665
- are violated.
666
- min_abs: the minimum average-absolute-value difference from the mean
667
- value per channel, which we allow, before we start to modify
668
- the derivatives to prevent this.
669
- max_abs: the maximum average-absolute-value difference from the mean
670
- value per channel, which we allow, before we start to modify
671
- the derivatives to prevent this.
672
- min_prob: determines the minimum probability with which we modify the
673
- gradients for the {min,max}_positive and {min,max}_abs constraints,
674
- on each forward(). This is done randomly to prevent all layers
675
- from doing it at the same time. Early in training we may use
676
- higher probabilities than this; it will decay to this value.
677
- """
678
-
679
- def __init__(
680
- self,
681
- num_channels: int,
682
- channel_dim: int,
683
- min_positive: float = 0.05,
684
- max_positive: float = 0.95,
685
- max_factor: float = 0.04,
686
- sign_gain_factor: float = 0.01,
687
- scale_gain_factor: float = 0.02,
688
- min_abs: float = 0.2,
689
- max_abs: float = 100.0,
690
- min_prob: float = 0.1,
691
- ):
692
- super(ActivationBalancer, self).__init__()
693
- self.num_channels = num_channels
694
- self.channel_dim = channel_dim
695
- self.min_positive = min_positive
696
- self.max_positive = max_positive
697
- self.max_factor = max_factor
698
- self.min_abs = min_abs
699
- self.max_abs = max_abs
700
- self.min_prob = min_prob
701
- self.sign_gain_factor = sign_gain_factor
702
- self.scale_gain_factor = scale_gain_factor
703
-
704
- # count measures how many times the forward() function has been called.
705
- # We occasionally sync this to a tensor called `count`, that exists to
706
- # make sure it is synced to disk when we load and save the model.
707
- self.cpu_count = 0
708
- self.register_buffer("count", torch.tensor(0, dtype=torch.int64))
709
-
710
- def forward(self, x: Tensor) -> Tensor:
711
- if (
712
- torch.jit.is_scripting()
713
- or not x.requires_grad
714
- or torch.jit.is_tracing()
715
- ):
716
- return _no_op(x)
717
-
718
- count = self.cpu_count
719
- self.cpu_count += 1
720
-
721
- if random.random() < 0.01:
722
- # Occasionally sync self.cpu_count with self.count.
723
- # count affects the decay of 'prob'. don't do this on every iter,
724
- # because syncing with the GPU is slow.
725
- self.cpu_count = max(self.cpu_count, self.count.item())
726
- self.count.fill_(self.cpu_count)
727
-
728
- # the prob of doing some work exponentially decreases from 0.5 till it hits
729
- # a floor at min_prob (==0.1, by default)
730
- prob = max(self.min_prob, 0.5 ** (1 + (count / 4000.0)))
731
-
732
- if random.random() < prob:
733
- sign_gain_factor = 0.5
734
- if self.min_positive != 0.0 or self.max_positive != 1.0:
735
- sign_factor = _compute_sign_factor(
736
- x,
737
- self.channel_dim,
738
- self.min_positive,
739
- self.max_positive,
740
- gain_factor=self.sign_gain_factor / prob,
741
- max_factor=self.max_factor,
742
- )
743
- else:
744
- sign_factor = None
745
-
746
- scale_factor = _compute_scale_factor(
747
- x.detach(),
748
- self.channel_dim,
749
- min_abs=self.min_abs,
750
- max_abs=self.max_abs,
751
- gain_factor=self.scale_gain_factor / prob,
752
- max_factor=self.max_factor,
753
- )
754
- return ActivationBalancerFunction.apply(
755
- x,
756
- scale_factor,
757
- sign_factor,
758
- self.channel_dim,
759
- )
760
- else:
761
- return _no_op(x)
762
-
763
-
764
- def penalize_abs_values_gt(x: Tensor, limit: float, penalty: float) -> Tensor:
765
- """
766
- Returns x unmodified, but in backprop will put a penalty for the excess of
767
- the absolute values of elements of x over the limit "limit". E.g. if
768
- limit == 10.0, then if x has any values over 10 it will get a penalty.
769
-
770
- Caution: the value of this penalty will be affected by grad scaling used
771
- in automatic mixed precision training. For the purposes we use this for,
772
- it shouldn't really matter, or may even be helpful; we just use this
773
- to disallow really implausible values of scores to be given to softmax.
774
- """
775
- x_sign = x.sign()
776
- over_limit = (x.abs() - limit) > 0
777
- # The following is a memory efficient way to penalize the absolute values of
778
- # x that's over the limit. (The memory efficiency comes when you think
779
- # about which items torch needs to cache for the autograd, and which ones it
780
- # can throw away). The numerical value of aux_loss as computed here will
781
- # actually be larger than it should be, by limit * over_limit.sum(), but it
782
- # has the same derivative as the real aux_loss which is penalty * (x.abs() -
783
- # limit).relu().
784
- aux_loss = penalty * ((x_sign * over_limit).to(torch.int8) * x)
785
- # note: we don't do sum() here on aux_loss, but it's as if we had done
786
- # sum() due to how with_loss() works.
787
- x = with_loss(x, aux_loss)
788
- # you must use x for something, or this will be ineffective.
789
- return x
790
-
791
-
792
- def _diag(x: Tensor): # like .diag(), but works for tensors with 3 dims.
793
- if x.ndim == 2:
794
- return x.diag()
795
- else:
796
- (batch, dim, dim) = x.shape
797
- x = x.reshape(batch, dim * dim)
798
- x = x[:, :: dim + 1]
799
- assert x.shape == (batch, dim)
800
- return x
801
-
802
-
803
- def _whitening_metric(x: Tensor, num_groups: int):
804
- """
805
- Computes the "whitening metric", a value which will be 1.0 if all the eigenvalues of
806
- the centered feature covariance are the same within each group's covariance matrix
807
- and also between groups.
808
- Args:
809
- x: a Tensor of shape (*, num_channels)
810
- num_groups: the number of groups of channels, a number >=1 that divides num_channels
811
- Returns:
812
- Returns a scalar Tensor that will be 1.0 if the data is "perfectly white" and
813
- greater than 1.0 otherwise.
814
- """
815
- assert x.dtype != torch.float16
816
- x = x.reshape(-1, x.shape[-1])
817
- (num_frames, num_channels) = x.shape
818
- assert num_channels % num_groups == 0
819
- channels_per_group = num_channels // num_groups
820
- x = x.reshape(num_frames, num_groups, channels_per_group).transpose(0, 1)
821
- # x now has shape (num_groups, num_frames, channels_per_group)
822
- # subtract the mean so we use the centered, not uncentered, covariance.
823
- # My experience has been that when we "mess with the gradients" like this,
824
- # it's better not do anything that tries to move the mean around, because
825
- # that can easily cause instability.
826
- x = x - x.mean(dim=1, keepdim=True)
827
- # x_covar: (num_groups, channels_per_group, channels_per_group)
828
- x_covar = torch.matmul(x.transpose(1, 2), x)
829
- x_covar_mean_diag = _diag(x_covar).mean()
830
- # the following expression is what we'd get if we took the matrix product
831
- # of each covariance and measured the mean of its trace, i.e.
832
- # the same as _diag(torch.matmul(x_covar, x_covar)).mean().
833
- x_covarsq_mean_diag = (x_covar ** 2).sum() / (
834
- num_groups * channels_per_group
835
- )
836
- # this metric will be >= 1.0; the larger it is, the less 'white' the data was.
837
- metric = x_covarsq_mean_diag / (x_covar_mean_diag ** 2 + 1.0e-20)
838
- return metric
839
-
840
-
841
- class WhiteningPenaltyFunction(torch.autograd.Function):
842
- @staticmethod
843
- def forward(
844
- ctx,
845
- x: Tensor,
846
- num_groups: int,
847
- whitening_limit: float,
848
- grad_scale: float,
849
- ) -> Tensor:
850
- ctx.save_for_backward(x)
851
- ctx.num_groups = num_groups
852
- ctx.whitening_limit = whitening_limit
853
- ctx.grad_scale = grad_scale
854
- return x
855
-
856
- @staticmethod
857
- def backward(ctx, x_grad: Tensor):
858
- (x_orig,) = ctx.saved_tensors
859
- with torch.enable_grad():
860
- with torch.cuda.amp.autocast(enabled=False):
861
- x_detached = x_orig.to(torch.float32).detach()
862
- x_detached.requires_grad = True
863
-
864
- metric = _whitening_metric(x_detached, ctx.num_groups)
865
-
866
- if random.random() < 0.005 or __name__ == "__main__":
867
- logging.info(
868
- f"Whitening: num_groups={ctx.num_groups}, num_channels={x_orig.shape[-1]}, "
869
- f"metric={metric.item():.2f} vs. limit={ctx.whitening_limit}"
870
- )
871
-
872
- (metric - ctx.whitening_limit).relu().backward()
873
- penalty_grad = x_detached.grad
874
- scale = ctx.grad_scale * (
875
- x_grad.to(torch.float32).norm()
876
- / (penalty_grad.norm() + 1.0e-20)
877
- )
878
- penalty_grad = penalty_grad * scale
879
- return x_grad + penalty_grad.to(x_grad.dtype), None, None, None
880
-
881
-
882
- class Whiten(nn.Module):
883
- def __init__(
884
- self,
885
- num_groups: int,
886
- whitening_limit: float,
887
- prob: Union[float, Tuple[float, float]],
888
- grad_scale: float,
889
- ):
890
- """
891
- Args:
892
- num_groups: the number of groups to divide the channel dim into before
893
- whitening. We will attempt to make the feature covariance
894
- within each group, after mean subtraction, as "white" as possible,
895
- while having the same trace across all groups.
896
- whitening_limit: a value greater than 1.0, that dictates how much
897
- freedom we have to violate the constraints. 1.0 would mean perfectly
898
- white, with exactly the same trace across groups; larger values
899
- give more freedom. E.g. 2.0.
900
- prob: the probability with which we apply the gradient modification
901
- (also affects the grad scale). May be supplied as a float,
902
- or as a pair (min_prob, max_prob)
903
-
904
- grad_scale: determines the scale on the gradient term from this object,
905
- relative to the rest of the gradient on the attention weights.
906
- E.g. 0.02 (you may want to use smaller values than this if prob is large)
907
- """
908
- super(Whiten, self).__init__()
909
- assert num_groups >= 1
910
- assert whitening_limit >= 1
911
- assert grad_scale >= 0
912
- self.num_groups = num_groups
913
- self.whitening_limit = whitening_limit
914
- if isinstance(prob, float):
915
- assert 0 < prob <= 1
916
- self.prob = prob
917
- else:
918
- (self.min_prob, self.max_prob) = prob
919
- assert 0 < self.min_prob < self.max_prob <= 1
920
- self.prob = self.max_prob
921
-
922
- self.grad_scale = grad_scale
923
-
924
- def forward(self, x: Tensor) -> Tensor:
925
- """
926
- In the forward pass, this function just returns the input unmodified.
927
- In the backward pass, it will modify the gradients to ensure that the
928
- distribution in each group has close to (lambda times I) as the covariance
929
- after mean subtraction, with the same lambda across groups.
930
- For whitening_limit > 1, there will be more freedom to violate this
931
- constraint.
932
-
933
- Args:
934
- x: the input of shape (*, num_channels)
935
-
936
- Returns:
937
- x, unmodified. You should make sure
938
- you use the returned value, or the graph will be freed
939
- and nothing will happen in backprop.
940
- """
941
- if (
942
- not x.requires_grad
943
- or random.random() > self.prob
944
- or self.grad_scale == 0
945
- ):
946
- return _no_op(x)
947
- else:
948
- if hasattr(self, "min_prob") and random.random() < 0.25:
949
- # occasionally switch between min_prob and max_prob, based on whether
950
- # we are above or below the threshold.
951
- if (
952
- _whitening_metric(x.to(torch.float32), self.num_groups)
953
- > self.whitening_limit
954
- ):
955
- # there would be a change to the grad.
956
- self.prob = self.max_prob
957
- else:
958
- self.prob = self.min_prob
959
-
960
- return WhiteningPenaltyFunction.apply(
961
- x, self.num_groups, self.whitening_limit, self.grad_scale
962
- )
963
-
964
-
965
- class WithLoss(torch.autograd.Function):
966
- @staticmethod
967
- def forward(ctx, x: Tensor, y: Tensor):
968
- ctx.y_shape = y.shape
969
- return x
970
-
971
- @staticmethod
972
- def backward(ctx, ans_grad: Tensor):
973
- return ans_grad, torch.ones(
974
- ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device
975
- )
976
-
977
-
978
- def with_loss(x, y):
979
- if torch.jit.is_scripting() or torch.jit.is_tracing():
980
- return x
981
- # returns x but adds y.sum() to the loss function.
982
- return WithLoss.apply(x, y)
983
-
984
-
985
- def _no_op(x: Tensor) -> Tensor:
986
- if torch.jit.is_scripting() or torch.jit.is_tracing():
987
- return x
988
- else:
989
- # a no-op function that will have a node in the autograd graph,
990
- # to avoid certain bugs relating to backward hooks
991
- return x.chunk(1, dim=-1)[0]
992
-
993
-
994
- class Identity(torch.nn.Module):
995
- def __init__(self):
996
- super(Identity, self).__init__()
997
-
998
- def forward(self, x):
999
- return _no_op(x)
1000
-
1001
-
1002
- class MaxEig(torch.nn.Module):
1003
- """
1004
- Modifies the backpropped derivatives of a function to try to discourage
1005
- that any given direction in activation space accounts for more than
1006
- a specified proportion of the covariance (e.g. 0.2).
1007
-
1008
-
1009
- Args:
1010
- num_channels: the number of channels
1011
- channel_dim: the dimension/axis corresponding to the channel, e.g.
1012
- -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
1013
- max_var_per_eig: the maximum proportion of the variance of the
1014
- features/channels, after mean subtraction, that can come from
1015
- any given eigenvalue.
1016
- min_prob: the minimum probability with which we apply this during any invocation
1017
- of forward(), assuming last time we applied the constraint it was
1018
- not active; supplied for speed.
1019
- scale: determines the scale with which we modify the gradients, relative
1020
- to the existing / unmodified gradients
1021
- """
1022
-
1023
- def __init__(
1024
- self,
1025
- num_channels: int,
1026
- channel_dim: int,
1027
- max_var_per_eig: float = 0.2,
1028
- min_prob: float = 0.01,
1029
- scale: float = 0.01,
1030
- ):
1031
- super(MaxEig, self).__init__()
1032
- self.num_channels = num_channels
1033
- self.channel_dim = channel_dim
1034
- self.scale = scale
1035
- assert max_var_per_eig == 0.0 or max_var_per_eig > 1.0 / num_channels
1036
- self.max_var_per_eig = max_var_per_eig
1037
-
1038
- # we figure out the dominant direction using the power method: starting with
1039
- # a random vector, keep multiplying by the covariance and renormalizing.
1040
- with torch.no_grad():
1041
- # arbitrary.. would use randn() but want to leave the rest of the model's
1042
- # random parameters unchanged for comparison
1043
- direction = torch.arange(num_channels).to(torch.float)
1044
- direction = direction / direction.norm()
1045
- self.register_buffer("max_eig_direction", direction)
1046
-
1047
- self.min_prob = min_prob
1048
- # cur_prob is the current probability we'll use to apply the ActivationBalancer.
1049
- # We'll regress this towards prob, each time we try to apply it and it is not
1050
- # active.
1051
- self.cur_prob = 1.0
1052
-
1053
- def forward(self, x: Tensor) -> Tensor:
1054
- if (
1055
- torch.jit.is_scripting()
1056
- or self.max_var_per_eig <= 0
1057
- or random.random() > self.cur_prob
1058
- or torch.jit.is_tracing()
1059
- ):
1060
- return _no_op(x)
1061
-
1062
- with torch.cuda.amp.autocast(enabled=False):
1063
- eps = 1.0e-20
1064
- orig_x = x
1065
- x = x.to(torch.float32)
1066
- with torch.no_grad():
1067
- x = x.transpose(self.channel_dim, -1).reshape(
1068
- -1, self.num_channels
1069
- )
1070
- x = x - x.mean(dim=0)
1071
- new_direction, coeffs = self._find_direction_coeffs(
1072
- x, self.max_eig_direction
1073
- )
1074
- x_var = (x ** 2).mean()
1075
- x_residual = x - coeffs * new_direction
1076
- x_residual_var = (x_residual ** 2).mean()
1077
-
1078
- # `variance_proportion` is the proportion of the variance accounted for
1079
- # by the top eigen-direction.
1080
- variance_proportion = (x_var - x_residual_var) / (
1081
- x_var + 1.0e-20
1082
- )
1083
-
1084
- # ensure new direction is nonzero even if x == 0, by including `direction`.
1085
- self._set_direction(
1086
- 0.1 * self.max_eig_direction + new_direction
1087
- )
1088
-
1089
- if random.random() < 0.01 or __name__ == "__main__":
1090
- logging.info(
1091
- f"variance_proportion = {variance_proportion.item()}, shape={tuple(orig_x.shape)}, cur_prob={self.cur_prob}"
1092
- )
1093
-
1094
- if variance_proportion >= self.max_var_per_eig:
1095
- # The constraint is active. Note, we should quite rarely
1096
- # reach here, only near the beginning of training if we are
1097
- # starting to diverge, should this constraint be active.
1098
- cur_prob = self.cur_prob
1099
- self.cur_prob = (
1100
- 1.0 # next time, do the update with probability 1.0.
1101
- )
1102
- return MaxEigLimiterFunction.apply(
1103
- orig_x, coeffs, new_direction, self.channel_dim, self.scale
1104
- )
1105
- else:
1106
- # let self.cur_prob exponentially approach self.min_prob, as
1107
- # long as the constraint is inactive.
1108
- self.cur_prob = 0.75 * self.cur_prob + 0.25 * self.min_prob
1109
- return orig_x
1110
-
1111
- def _set_direction(self, direction: Tensor):
1112
- """
1113
- Sets self.max_eig_direction to a normalized version of `direction`
1114
- """
1115
- direction = direction.detach()
1116
- direction = direction / direction.norm()
1117
- direction_sum = direction.sum().item()
1118
- if direction_sum - direction_sum == 0: # no inf/nan
1119
- self.max_eig_direction[:] = direction
1120
- else:
1121
- logging.info(
1122
- f"Warning: sum of direction in MaxEig is {direction_sum}, "
1123
- "num_channels={self.num_channels}, channel_dim={self.channel_dim}"
1124
- )
1125
-
1126
- def _find_direction_coeffs(
1127
- self, x: Tensor, prev_direction: Tensor
1128
- ) -> Tuple[Tensor, Tensor]:
1129
- """
1130
- Figure out (an approximation to) the proportion of the variance of a set of
1131
- feature vectors that can be attributed to the top eigen-direction.
1132
- Args:
1133
- x: a Tensor of shape (num_frames, num_channels), with num_frames > 1.
1134
- prev_direction: a Tensor of shape (num_channels,), that is our previous estimate
1135
- of the top eigen-direction, or a random direction if this is the first
1136
- iteration. Does not have to be normalized, but should be nonzero.
1137
-
1138
- Returns: (cur_direction, coeffs), where:
1139
- cur_direction: a Tensor of shape (num_channels,) that is the current
1140
- estimate of the top eigen-direction.
1141
- coeffs: a Tensor of shape (num_frames, 1) that minimizes, or
1142
- approximately minimizes, (x - coeffs * cur_direction).norm()
1143
- """
1144
- (num_frames, num_channels) = x.shape
1145
- assert num_channels > 1 and num_frames > 1
1146
- assert prev_direction.shape == (num_channels,)
1147
- # `coeffs` are the coefficients of `prev_direction` in x; they
1148
- # actually represent the coeffs up to a constant positive factor.
1149
- coeffs = (x * prev_direction).sum(dim=1, keepdim=True) + 1.0e-10
1150
- cur_direction = (x * coeffs).sum(dim=0) / (
1151
- (coeffs ** 2).sum() + 1.0e-20
1152
- )
1153
- return cur_direction, coeffs
1154
-
1155
-
1156
- class DoubleSwishFunction(torch.autograd.Function):
1157
- """
1158
- double_swish(x) = x * torch.sigmoid(x-1)
1159
- This is a definition, originally motivated by its close numerical
1160
- similarity to swish(swish(x)), where swish(x) = x * sigmoid(x).
1161
-
1162
- Memory-efficient derivative computation:
1163
- double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1)
1164
- double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x).
1165
- Now, s'(x) = s(x) * (1-s(x)).
1166
- double_swish'(x) = x * s'(x) + s(x).
1167
- = x * s(x) * (1-s(x)) + s(x).
1168
- = double_swish(x) * (1-s(x)) + s(x)
1169
- ... so we just need to remember s(x) but not x itself.
1170
- """
1171
-
1172
- @staticmethod
1173
- def forward(ctx, x: Tensor) -> Tensor:
1174
- requires_grad = x.requires_grad
1175
- x_dtype = x.dtype
1176
- if x.dtype == torch.float16:
1177
- x = x.to(torch.float32)
1178
-
1179
- s = torch.sigmoid(x - 1.0)
1180
- y = x * s
1181
-
1182
- if requires_grad:
1183
- deriv = y * (1 - s) + s
1184
- # notes on derivative of x * sigmoid(x - 1):
1185
- # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29
1186
- # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bound.
1187
- # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound.
1188
- # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which
1189
- # floors), should be expectation-preserving.
1190
- floor = -0.043637
1191
- ceil = 1.2
1192
- d_scaled = (deriv - floor) * (
1193
- 255.0 / (ceil - floor)
1194
- ) + torch.rand_like(deriv)
1195
- if __name__ == "__main__":
1196
- # for self-testing only.
1197
- assert d_scaled.min() >= 0.0
1198
- assert d_scaled.max() < 256.0
1199
- d_int = d_scaled.to(torch.uint8)
1200
- ctx.save_for_backward(d_int)
1201
- if x.dtype == torch.float16 or torch.is_autocast_enabled():
1202
- y = y.to(torch.float16)
1203
- return y
1204
-
1205
- @staticmethod
1206
- def backward(ctx, y_grad: Tensor) -> Tensor:
1207
- (d,) = ctx.saved_tensors
1208
- # the same constants as used in forward pass.
1209
- floor = -0.043637
1210
- ceil = 1.2
1211
- d = d * ((ceil - floor) / 255.0) + floor
1212
- return y_grad * d
1213
-
1214
-
1215
- class DoubleSwish(torch.nn.Module):
1216
- def forward(self, x: Tensor) -> Tensor:
1217
- """Return double-swish activation function which is an approximation to Swish(Swish(x)),
1218
- that we approximate closely with x * sigmoid(x-1).
1219
- """
1220
- if torch.jit.is_scripting() or torch.jit.is_tracing():
1221
- return x * torch.sigmoid(x - 1.0)
1222
- return DoubleSwishFunction.apply(x)
1223
-
1224
-
1225
- def BalancedDoubleSwish(
1226
- d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25
1227
- ) -> nn.Sequential:
1228
- """
1229
- ActivationBalancer -> DoubleSwish
1230
- """
1231
- balancer = ActivationBalancer(
1232
- d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob
1233
- )
1234
- return nn.Sequential(
1235
- balancer,
1236
- DoubleSwish(),
1237
- )
1238
-
1239
-
1240
- def _test_max_eig():
1241
- for proportion in [0.1, 0.5, 10.0]:
1242
- logging.info(f"proportion = {proportion}")
1243
- x = torch.randn(100, 128)
1244
- direction = torch.randn(128)
1245
- coeffs = torch.randn(100, 1)
1246
- x += proportion * direction * coeffs
1247
-
1248
- x.requires_grad = True
1249
-
1250
- num_channels = 128
1251
- m = MaxEig(
1252
- num_channels, 1, 0.5, scale=0.1 # channel_dim # max_var_per_eig
1253
- ) # grad_scale
1254
-
1255
- for _ in range(4):
1256
- y = m(x)
1257
-
1258
- y_grad = torch.randn_like(x)
1259
- y.backward(gradient=y_grad)
1260
-
1261
- if proportion < 0.2:
1262
- assert torch.allclose(x.grad, y_grad, atol=1.0e-02)
1263
- elif proportion > 1.0:
1264
- assert not torch.allclose(x.grad, y_grad)
1265
-
1266
-
1267
- def _test_whiten():
1268
- for proportion in [0.1, 0.5, 10.0]:
1269
- logging.info(f"_test_whiten(): proportion = {proportion}")
1270
- x = torch.randn(100, 128)
1271
- direction = torch.randn(128)
1272
- coeffs = torch.randn(100, 1)
1273
- x += proportion * direction * coeffs
1274
-
1275
- x.requires_grad = True
1276
-
1277
- num_channels = 128
1278
- m = Whiten(
1279
- 1, 5.0, prob=1.0, grad_scale=0.1 # num_groups # whitening_limit,
1280
- ) # grad_scale
1281
-
1282
- for _ in range(4):
1283
- y = m(x)
1284
-
1285
- y_grad = torch.randn_like(x)
1286
- y.backward(gradient=y_grad)
1287
-
1288
- if proportion < 0.2:
1289
- assert torch.allclose(x.grad, y_grad)
1290
- elif proportion > 1.0:
1291
- assert not torch.allclose(x.grad, y_grad)
1292
-
1293
-
1294
- def _test_activation_balancer_sign():
1295
- probs = torch.arange(0, 1, 0.01)
1296
- N = 1000
1297
- x = 1.0 * (
1298
- (2.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1))) - 1.0
1299
- )
1300
- x = x.detach()
1301
- x.requires_grad = True
1302
- m = ActivationBalancer(
1303
- probs.numel(),
1304
- channel_dim=0,
1305
- min_positive=0.05,
1306
- max_positive=0.95,
1307
- max_factor=0.2,
1308
- min_abs=0.0,
1309
- )
1310
-
1311
- y_grad = torch.sign(torch.randn(probs.numel(), N))
1312
-
1313
- y = m(x)
1314
- y.backward(gradient=y_grad)
1315
- print("_test_activation_balancer_sign: x = ", x)
1316
- print("_test_activation_balancer_sign: y grad = ", y_grad)
1317
- print("_test_activation_balancer_sign: x grad = ", x.grad)
1318
-
1319
-
1320
- def _test_activation_balancer_magnitude():
1321
- magnitudes = torch.arange(0, 1, 0.01)
1322
- N = 1000
1323
- x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(
1324
- -1
1325
- )
1326
- x = x.detach()
1327
- x.requires_grad = True
1328
- m = ActivationBalancer(
1329
- magnitudes.numel(),
1330
- channel_dim=0,
1331
- min_positive=0.0,
1332
- max_positive=1.0,
1333
- max_factor=0.2,
1334
- min_abs=0.2,
1335
- max_abs=0.8,
1336
- min_prob=1.0,
1337
- )
1338
-
1339
- y_grad = torch.sign(torch.randn(magnitudes.numel(), N))
1340
-
1341
- y = m(x)
1342
- y.backward(gradient=y_grad)
1343
- print("_test_activation_balancer_magnitude: x = ", x)
1344
- print("_test_activation_balancer_magnitude: y grad = ", y_grad)
1345
- print("_test_activation_balancer_magnitude: x grad = ", x.grad)
1346
-
1347
-
1348
- def _test_basic_norm():
1349
- num_channels = 128
1350
- m = BasicNorm(num_channels=num_channels, channel_dim=1)
1351
-
1352
- x = torch.randn(500, num_channels)
1353
-
1354
- y = m(x)
1355
-
1356
- assert y.shape == x.shape
1357
- x_rms = (x ** 2).mean().sqrt()
1358
- y_rms = (y ** 2).mean().sqrt()
1359
- print("x rms = ", x_rms)
1360
- print("y rms = ", y_rms)
1361
- assert y_rms < x_rms
1362
- assert y_rms > 0.5 * x_rms
1363
-
1364
-
1365
- def _test_double_swish_deriv():
1366
- x = torch.randn(10, 12, dtype=torch.double) * 3.0
1367
- x.requires_grad = True
1368
- m = DoubleSwish()
1369
-
1370
- tol = (1.2 - (-0.043637)) / 255.0
1371
- torch.autograd.gradcheck(m, x, atol=tol)
1372
-
1373
- # for self-test.
1374
- x = torch.randn(1000, 1000, dtype=torch.double) * 3.0
1375
- x.requires_grad = True
1376
- y = m(x)
1377
-
1378
-
1379
- def _test_softmax():
1380
- a = torch.randn(2, 10, dtype=torch.float64)
1381
- b = a.clone()
1382
- a.requires_grad = True
1383
- b.requires_grad = True
1384
- a.softmax(dim=1)[:, 0].sum().backward()
1385
- print("a grad = ", a.grad)
1386
- softmax(b, dim=1)[:, 0].sum().backward()
1387
- print("b grad = ", b.grad)
1388
- assert torch.allclose(a.grad, b.grad)
1389
-
1390
-
1391
- if __name__ == "__main__":
1392
- logging.getLogger().setLevel(logging.INFO)
1393
- torch.set_num_threads(1)
1394
- torch.set_num_interop_threads(1)
1395
- _test_softmax()
1396
- _test_whiten()
1397
- _test_max_eig()
1398
- _test_activation_balancer_sign()
1399
- _test_activation_balancer_magnitude()
1400
- _test_basic_norm()
1401
- _test_double_swish_deriv()
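
A minimal sketch (not taken from this repository) of how the main pieces of scaling.py compose into a feed-forward block. It assumes the repo root is on PYTHONPATH so the file is importable as modules.scaling; the layer sizes are illustrative.

import torch
import torch.nn as nn
from modules.scaling import BalancedDoubleSwish, BasicNorm, DoubleSwish, ScaledLinear

d_model, d_ff = 256, 1024
feed_forward = nn.Sequential(
    ScaledLinear(d_model, d_ff),                      # nn.Linear with rescaled initial weights
    BalancedDoubleSwish(d_ff),                        # ActivationBalancer -> DoubleSwish
    ScaledLinear(d_ff, d_model, initial_scale=0.25),
    BasicNorm(d_model),                               # learnable-eps stand-in for LayerNorm
)

x = torch.randn(8, 100, d_model, requires_grad=True)
y = feed_forward(x)
y.sum().backward()  # ActivationBalancer only modifies gradients, so it acts in the backward pass

# DoubleSwish evaluates x * sigmoid(x - 1); only its derivative is quantized to save memory.
z = torch.randn(4, d_model)
assert torch.allclose(DoubleSwish()(z), z * torch.sigmoid(z - 1.0), atol=1e-6)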
 
modules/scheduler.py DELETED
@@ -1,78 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright 2023 (authors: Feiteng Li)
3
- #
4
- # See ../../../../LICENSE for clarification regarding multiple authors
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
-
19
- import torch
20
-
21
- from modules.optim import Eden
22
-
23
-
24
- def calc_lr(step, dim_embed, warmup_steps):
25
- return dim_embed ** (-0.5) * min(
26
- step ** (-0.5), step * warmup_steps ** (-1.5)
27
- )
28
-
29
-
30
- class NoamScheduler(torch.optim.lr_scheduler._LRScheduler):
31
- def __init__(
32
- self,
33
- base_lr: float,
34
- optimizer: torch.optim.Optimizer,
35
- dim_embed: int,
36
- warmup_steps: int,
37
- last_epoch: int = -1,
38
- verbose: bool = False,
39
- ) -> None:
40
-
41
- self.dim_embed = dim_embed
42
- self.base_lr = base_lr
43
- self.warmup_steps = warmup_steps
44
- self.num_param_groups = len(optimizer.param_groups)
45
-
46
- super().__init__(optimizer, last_epoch, verbose)
47
-
48
- def get_lr(self) -> float:
49
- lr = self.base_lr * calc_lr(
50
- self._step_count, self.dim_embed, self.warmup_steps
51
- )
52
- return [lr] * self.num_param_groups
53
-
54
- def set_step(self, step: int):
55
- self._step_count = step
56
-
57
-
58
- def get_scheduler(params, optimizer):
59
- if params.scheduler_name.lower() == "eden":
60
- scheduler = Eden(optimizer, 5000, 4, warmup_batches=params.warmup_steps)
61
- elif params.scheduler_name.lower() == "noam":
62
- scheduler = NoamScheduler(
63
- params.base_lr,
64
- optimizer,
65
- params.decoder_dim,
66
- warmup_steps=params.warmup_steps,
67
- )
68
- # scheduler.set_step(params.start_batch or params.batch_idx_train)
69
- elif params.scheduler_name.lower() == "cosine":
70
- scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
71
- optimizer,
72
- params.warmup_steps,
73
- eta_min=params.base_lr,
74
- )
75
- else:
76
- raise NotImplementedError(f"{params.scheduler_name}")
77
-
78
- return scheduler
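
A minimal usage sketch of NoamScheduler (not part of this file; the model, optimizer, and hyperparameter values are illustrative assumptions). get_scheduler() above would normally choose between this, Eden, and CosineAnnealingLR based on params.scheduler_name.

import torch
from modules.scheduler import NoamScheduler

model = torch.nn.Linear(1024, 1024)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)  # lr is overwritten by the scheduler
scheduler = NoamScheduler(base_lr=0.05, optimizer=optimizer, dim_embed=1024, warmup_steps=200)

for step in range(1, 1001):
    loss = model(torch.randn(16, 1024)).pow(2).mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # the schedule is base_lr * calc_lr(scheduler._step_count, dim_embed, warmup_steps)
    scheduler.step()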
 
modules/transformer.py DELETED
@@ -1,683 +0,0 @@
1
- import copy
2
- import numbers
3
- from functools import partial
4
- from typing import Any, Callable, List, Optional, Tuple, Union
5
-
6
- import torch
7
- from torch import Tensor, nn
8
- from torch.nn import functional as F
9
-
10
- from .activation import MultiheadAttention
11
- from .scaling import ActivationBalancer, BalancedDoubleSwish
12
- from .scaling import BasicNorm as _BasicNorm
13
-
14
- _shape_t = Union[int, List[int], torch.Size]
15
-
16
-
17
- class LayerNorm(nn.Module):
18
- __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
19
- normalized_shape: Tuple[int, ...]
20
- eps: float
21
- elementwise_affine: bool
22
-
23
- def __init__(
24
- self,
25
- normalized_shape: _shape_t,
26
- eps: float = 1e-5,
27
- elementwise_affine: bool = True,
28
- device=None,
29
- dtype=None,
30
- ) -> None:
31
- factory_kwargs = {"device": device, "dtype": dtype}
32
- super(LayerNorm, self).__init__()
33
- if isinstance(normalized_shape, numbers.Integral):
34
- # mypy error: incompatible types in assignment
35
- normalized_shape = (normalized_shape,) # type: ignore[assignment]
36
- self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type]
37
- self.eps = eps
38
- self.elementwise_affine = elementwise_affine
39
- if self.elementwise_affine:
40
- self.weight = nn.Parameter(
41
- torch.empty(self.normalized_shape, **factory_kwargs)
42
- )
43
- self.bias = nn.Parameter(
44
- torch.empty(self.normalized_shape, **factory_kwargs)
45
- )
46
- else:
47
- self.register_parameter("weight", None)
48
- self.register_parameter("bias", None)
49
-
50
- self.reset_parameters()
51
-
52
- def reset_parameters(self) -> None:
53
- if self.elementwise_affine:
54
- nn.init.ones_(self.weight)
55
- nn.init.zeros_(self.bias)
56
-
57
- def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
58
- if isinstance(input, tuple):
59
- input, embedding = input
60
- return (
61
- F.layer_norm(
62
- input,
63
- self.normalized_shape,
64
- self.weight,
65
- self.bias,
66
- self.eps,
67
- ),
68
- embedding,
69
- )
70
-
71
- assert embedding is None
72
- return F.layer_norm(
73
- input, self.normalized_shape, self.weight, self.bias, self.eps
74
- )
75
-
76
- def extra_repr(self) -> str:
77
- return (
78
- "{normalized_shape}, eps={eps}, "
79
- "elementwise_affine={elementwise_affine}".format(**self.__dict__)
80
- )
81
-
82
-
83
- class AdaptiveLayerNorm(nn.Module):
84
- r"""Adaptive Layer Normalization"""
85
-
86
- def __init__(self, d_model, norm) -> None:
87
- super(AdaptiveLayerNorm, self).__init__()
88
- self.project_layer = nn.Linear(d_model, 2 * d_model)
89
- self.norm = norm
90
- self.d_model = d_model
91
- self.eps = self.norm.eps
92
-
93
- def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor:
94
- if isinstance(input, tuple):
95
- input, embedding = input
96
- weight, bias = torch.split(
97
- self.project_layer(embedding),
98
- split_size_or_sections=self.d_model,
99
- dim=-1,
100
- )
101
- return (weight * self.norm(input) + bias, embedding)
102
-
103
- weight, bias = torch.split(
104
- self.project_layer(embedding),
105
- split_size_or_sections=self.d_model,
106
- dim=-1,
107
- )
108
- return weight * self.norm(input) + bias
109
-
110
-
- class BasicNorm(_BasicNorm):
-     def __init__(
-         self,
-         d_model: int,
-         eps: float = 1e-5,
-         device=None,
-         dtype=None,
-     ):
-         super(BasicNorm, self).__init__(d_model, eps=eps)
-
-     def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
-         if isinstance(input, tuple):
-             input, embedding = input
-             return (
-                 super(BasicNorm, self).forward(input),
-                 embedding,
-             )
-
-         assert embedding is None
-         return super(BasicNorm, self).forward(input)
-
-
- class BalancedBasicNorm(nn.Module):
-     def __init__(
-         self,
-         d_model: int,
-         eps: float = 1e-5,
-         device=None,
-         dtype=None,
-     ):
-         super(BalancedBasicNorm, self).__init__()
-         self.balancer = ActivationBalancer(
-             d_model,
-             channel_dim=-1,
-             min_positive=0.45,
-             max_positive=0.55,
-             max_abs=6.0,
-         )
-         self.norm = BasicNorm(d_model, eps, device=device, dtype=dtype)
-
-     def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
-         if isinstance(input, tuple):
-             input, embedding = input
-             return self.norm((self.balancer(input), embedding))
-
-         assert embedding is None
-         return self.norm(self.balancer(input))
-
-
- class IdentityNorm(nn.Module):
-     def __init__(
-         self,
-         d_model: int,
-         eps: float = 1e-5,
-         device=None,
-         dtype=None,
-     ) -> None:
-         super(IdentityNorm, self).__init__()
-
-     def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
-         if isinstance(input, tuple):
-             return input
-
-         assert embedding is None
-         return input
-
-
- class TransformerEncoderLayer(nn.Module):
-     __constants__ = ["batch_first", "norm_first"]
-
-     def __init__(
-         self,
-         d_model: int,
-         nhead: int,
-         dim_feedforward: int = 2048,
-         dropout: float = 0.1,
-         activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
-         batch_first: bool = False,
-         norm_first: bool = False,
-         device=None,
-         dtype=None,
-         linear1_self_attention_cls: nn.Module = nn.Linear,
-         linear2_self_attention_cls: nn.Module = nn.Linear,
-         linear1_feedforward_cls: nn.Module = nn.Linear,
-         linear2_feedforward_cls: nn.Module = nn.Linear,
-         layer_norm_cls: nn.Module = LayerNorm,
-         layer_norm_eps: float = 1e-5,
-         adaptive_layer_norm=False,
-     ) -> None:
-         factory_kwargs = {"device": device, "dtype": dtype}
-         super(TransformerEncoderLayer, self).__init__()
-         self.self_attn = MultiheadAttention(
-             d_model,
-             nhead,
-             dropout=dropout,
-             batch_first=batch_first,
-             linear1_cls=linear1_self_attention_cls,
-             linear2_cls=linear2_self_attention_cls,
-             **factory_kwargs,
-         )
-
-         # Implementation of Feedforward model
-         self.linear1 = linear1_feedforward_cls(
-             d_model, dim_feedforward, **factory_kwargs
-         )
-         self.dropout = nn.Dropout(dropout)
-         self.linear2 = linear2_feedforward_cls(
-             dim_feedforward, d_model, **factory_kwargs
-         )
-
-         self.norm_first = norm_first
-         self.dropout1 = nn.Dropout(dropout)
-         self.dropout2 = nn.Dropout(dropout)
-
-         # Legacy string support for activation function.
-         if isinstance(activation, str):
-             activation = _get_activation_fn(activation)
-         elif isinstance(activation, partial):
-             activation = activation(d_model)
-         elif activation == BalancedDoubleSwish:
-             activation = BalancedDoubleSwish(d_model)
-
-         # # We can't test self.activation in forward() in TorchScript,
-         # # so stash some information about it instead.
-         # if activation is F.relu or isinstance(activation, torch.nn.ReLU):
-         #     self.activation_relu_or_gelu = 1
-         # elif activation is F.gelu or isinstance(activation, torch.nn.GELU):
-         #     self.activation_relu_or_gelu = 2
-         # else:
-         #     self.activation_relu_or_gelu = 0
-         self.activation = activation
-
-         norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
-         if layer_norm_cls == IdentityNorm:
-             norm2 = BalancedBasicNorm(
-                 d_model, eps=layer_norm_eps, **factory_kwargs
-             )
-         else:
-             norm2 = layer_norm_cls(
-                 d_model, eps=layer_norm_eps, **factory_kwargs
-             )
-
-         if adaptive_layer_norm:
-             self.norm1 = AdaptiveLayerNorm(d_model, norm1)
-             self.norm2 = AdaptiveLayerNorm(d_model, norm2)
-         else:
-             self.norm1 = norm1
-             self.norm2 = norm2
-
-     def __setstate__(self, state):
-         super(TransformerEncoderLayer, self).__setstate__(state)
-         if not hasattr(self, "activation"):
-             self.activation = F.relu
-
-     def forward(
-         self,
-         src: Tensor,
-         src_mask: Optional[Tensor] = None,
-         src_key_padding_mask: Optional[Tensor] = None,
-     ) -> Tensor:
-         r"""Pass the input through the encoder layer.
-
-         Args:
-             src: the sequence to the encoder layer (required).
-             src_mask: the mask for the src sequence (optional).
-             src_key_padding_mask: the mask for the src keys per batch (optional).
-
-         Shape:
-             see the docs in Transformer class.
-         """
-         x, stage_embedding = src, None
-         is_src_tuple = False
-         if isinstance(src, tuple):
-             x, stage_embedding = src
-             is_src_tuple = True
-
-         if src_key_padding_mask is not None:
-             _skpm_dtype = src_key_padding_mask.dtype
-             if _skpm_dtype != torch.bool and not torch.is_floating_point(
-                 src_key_padding_mask
-             ):
-                 raise AssertionError(
-                     "only bool and floating types of key_padding_mask are supported"
-                 )
-
-         if self.norm_first:
-             x = x + self._sa_block(
-                 self.norm1(x, stage_embedding),
-                 src_mask,
-                 src_key_padding_mask,
-             )
-             x = x + self._ff_block(self.norm2(x, stage_embedding))
-         else:
-             x = self.norm1(
-                 x + self._sa_block(x, src_mask, src_key_padding_mask),
-                 stage_embedding,
-             )
-             x = self.norm2(x + self._ff_block(x), stage_embedding)
-
-         if is_src_tuple:
-             return (x, stage_embedding)
-         return x
-
-     def infer(
-         self,
-         src: Tensor,
-         src_mask: Optional[Tensor] = None,
-         src_key_padding_mask: Optional[Tensor] = None,
-         past_kv: Optional[Tensor] = None,
-         use_cache: bool = False,
-     ):
-         x, stage_embedding = src, None
-         is_src_tuple = False
-         if isinstance(src, tuple):
-             x, stage_embedding = src
-             is_src_tuple = True
-
-         if src_key_padding_mask is not None:
-             _skpm_dtype = src_key_padding_mask.dtype
-             if _skpm_dtype != torch.bool and not torch.is_floating_point(
-                 src_key_padding_mask
-             ):
-                 raise AssertionError(
-                     "only bool and floating types of key_padding_mask are supported"
-                 )
-
-         if self.norm_first:
-             x_attn_out, kv = self.self_attn.infer(
-                 self.norm1(x, stage_embedding),
-                 attn_mask=src_mask,
-                 key_padding_mask=src_key_padding_mask,
-                 need_weights=False,
-                 past_kv=past_kv,
-                 use_cache=use_cache,
-             )
-             x = x + x_attn_out
-             x = x + self._ff_block(self.norm2(x, stage_embedding))
-
-         if is_src_tuple:
-             return (x, stage_embedding)
-         return (x, kv)
-
-     # self-attention block
-     def _sa_block(
-         self,
-         x: Tensor,
-         attn_mask: Optional[Tensor],
-         key_padding_mask: Optional[Tensor],
-     ) -> Tensor:
-         x = self.self_attn(
-             x,
-             x,
-             x,
-             attn_mask=attn_mask,
-             key_padding_mask=key_padding_mask,
-             need_weights=False,
-         )[0]
-         return self.dropout1(x)
-
-     # feed forward block
-     def _ff_block(self, x: Tensor) -> Tensor:
-         x = self.linear2(self.dropout(self.activation(self.linear1(x))))
-         return self.dropout2(x)
-
-
- class TransformerEncoder(nn.Module):
-     r"""TransformerEncoder is a stack of N encoder layers. Users can build the
-     BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.
-
-     Args:
-         encoder_layer: an instance of the TransformerEncoderLayer() class (required).
-         num_layers: the number of sub-encoder-layers in the encoder (required).
-         norm: the layer normalization component (optional).
-         enable_nested_tensor: if True, input will automatically convert to nested tensor
-             (and convert back on output). This will improve the overall performance of
-             TransformerEncoder when padding rate is high. Default: ``True`` (enabled).
-
-     Examples::
-         >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
-         >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6)
-         >>> src = torch.rand(10, 32, 512)
-         >>> out = transformer_encoder(src)
-     """
-     __constants__ = ["norm"]
-
-     def __init__(self, encoder_layer, num_layers, norm=None):
-         super(TransformerEncoder, self).__init__()
-         self.layers = _get_clones(encoder_layer, num_layers)
-         self.num_layers = num_layers
-         self.norm = norm
-
-     def forward(
-         self,
-         src: Tensor,
-         mask: Optional[Tensor] = None,
-         src_key_padding_mask: Optional[Tensor] = None,
-         return_layer_states: bool = False,
-     ) -> Tensor:
-         r"""Pass the input through the encoder layers in turn.
-
-         Args:
-             src: the sequence to the encoder (required).
-             mask: the mask for the src sequence (optional).
-             src_key_padding_mask: the mask for the src keys per batch (optional).
-             return_layer_states: return layers' state (optional).
-
-         Shape:
-             see the docs in Transformer class.
-         """
-         if return_layer_states:
-             layer_states = []  # layers' output
-             output = src
-             for mod in self.layers:
-                 output = mod(
-                     output,
-                     src_mask=mask,
-                     src_key_padding_mask=src_key_padding_mask,
-                 )
-                 layer_states.append(output[0])
-
-             if self.norm is not None:
-                 output = self.norm(output)
-
-             return layer_states, output
-
-         output = src
-         for mod in self.layers:
-             output = mod(
-                 output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
-             )
-
-         if self.norm is not None:
-             output = self.norm(output)
-
-         return output
-
-     def infer(
-         self,
-         src: Tensor,
-         mask: Optional[Tensor] = None,
-         src_key_padding_mask: Optional[Tensor] = None,
-         return_layer_states: bool = False,
-         past_kv: Optional[Tensor] = None,
-         use_cache: bool = False,
-     ):
-         if past_kv is None:
-             past_length = 0
-             past_kv = tuple([None] * self.num_layers)
-         else:
-             past_length = past_kv[0][0].size(-2)
-         new_kv = () if use_cache else None
-         output = src
-         for mod, past_layer_kv in zip(self.layers, past_kv):
-             output, kv = mod.infer(
-                 output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, past_kv=past_layer_kv, use_cache=use_cache
-             )
-             if use_cache:
-                 new_kv = new_kv + (kv,)
-
-         if self.norm is not None:
-             output = self.norm(output)
-
-         return output, new_kv
-
-
- class TransformerDecoderLayer(nn.Module):
-     __constants__ = ["batch_first", "norm_first"]
-
-     def __init__(
-         self,
-         d_model: int,
-         nhead: int,
-         dim_feedforward: int = 2048,
-         dropout: float = 0.1,
-         activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
-         linear1_self_attention_cls: nn.Module = nn.Linear,
-         linear2_self_attention_cls: nn.Module = nn.Linear,
-         linear1_feedforward_cls: nn.Module = nn.Linear,
-         linear2_feedforward_cls: nn.Module = nn.Linear,
-         batch_first: bool = False,
-         norm_first: bool = False,
-         device=None,
-         dtype=None,
-         layer_norm_cls: nn.Module = LayerNorm,
-         layer_norm_eps: float = 1e-5,
-         adaptive_layer_norm=False,
-     ) -> None:
-         factory_kwargs = {"device": device, "dtype": dtype}
-         super(TransformerDecoderLayer, self).__init__()
-         self.self_attn = MultiheadAttention(
-             d_model,
-             nhead,
-             dropout=dropout,
-             batch_first=batch_first,
-             linear1_cls=linear1_self_attention_cls,
-             linear2_cls=linear2_self_attention_cls,
-             **factory_kwargs,
-         )
-         self.multihead_attn = MultiheadAttention(
-             d_model,
-             nhead,
-             dropout=dropout,
-             batch_first=batch_first,
-             linear1_cls=linear1_self_attention_cls,
-             linear2_cls=linear2_self_attention_cls,
-             **factory_kwargs,
-         )
-         # Implementation of Feedforward model
-         self.linear1 = linear1_feedforward_cls(
-             d_model, dim_feedforward, **factory_kwargs
-         )
-         self.dropout = nn.Dropout(dropout)
-         self.linear2 = linear2_feedforward_cls(
-             dim_feedforward, d_model, **factory_kwargs
-         )
-
-         self.norm_first = norm_first
-         self.dropout1 = nn.Dropout(dropout)
-         self.dropout2 = nn.Dropout(dropout)
-         self.dropout3 = nn.Dropout(dropout)
-
-         # Legacy string support for activation function.
-         if isinstance(activation, str):
-             self.activation = _get_activation_fn(activation)
-         elif isinstance(activation, partial):
-             self.activation = activation(d_model)
-         elif activation == BalancedDoubleSwish:
-             self.activation = BalancedDoubleSwish(d_model)
-         else:
-             self.activation = activation
-
-         if adaptive_layer_norm:
-             norm1 = layer_norm_cls(
-                 d_model, eps=layer_norm_eps, **factory_kwargs
-             )
-             norm2 = layer_norm_cls(
-                 d_model, eps=layer_norm_eps, **factory_kwargs
-             )
-             norm3 = layer_norm_cls(
-                 d_model, eps=layer_norm_eps, **factory_kwargs
-             )
-
-             self.norm1 = AdaptiveLayerNorm(d_model, norm1)
-             self.norm2 = AdaptiveLayerNorm(d_model, norm2)
-             self.norm3 = AdaptiveLayerNorm(d_model, norm3)
-         else:
-             self.norm1 = layer_norm_cls(
-                 d_model, eps=layer_norm_eps, **factory_kwargs
-             )
-             self.norm2 = layer_norm_cls(
-                 d_model, eps=layer_norm_eps, **factory_kwargs
-             )
-             if layer_norm_cls == IdentityNorm:
-                 self.norm3 = BalancedBasicNorm(
-                     d_model, eps=layer_norm_eps, **factory_kwargs
-                 )
-             else:
-                 self.norm3 = layer_norm_cls(
-                     d_model, eps=layer_norm_eps, **factory_kwargs
-                 )
-
-     def forward(
-         self,
-         tgt: Tensor,
-         memory: Tensor,
-         tgt_mask: Optional[Tensor] = None,
-         memory_mask: Optional[Tensor] = None,
-         tgt_key_padding_mask: Optional[Tensor] = None,
-         memory_key_padding_mask: Optional[Tensor] = None,
-     ) -> Tensor:
-         r"""Pass the inputs (and mask) through the decoder layer.
-
-         Args:
-             tgt: the sequence to the decoder layer (required).
-             memory: the sequence from the last layer of the encoder (required).
-             tgt_mask: the mask for the tgt sequence (optional).
-             memory_mask: the mask for the memory sequence (optional).
-             tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
-             memory_key_padding_mask: the mask for the memory keys per batch (optional).
-
-         Shape:
-             see the docs in Transformer class.
-         """
-         tgt_is_tuple = False
-         if isinstance(tgt, tuple):
-             x, stage_embedding = tgt
-             tgt_is_tuple = True
-         else:
-             x, stage_embedding = tgt, None
-
-         if self.norm_first:
-             x = x + self._sa_block(
-                 self.norm1(x, stage_embedding), tgt_mask, tgt_key_padding_mask
-             )
-             x = x + self._mha_block(
-                 self.norm2(x, stage_embedding),
-                 memory,
-                 memory_mask,
-                 memory_key_padding_mask,
-             )
-             x = x + self._ff_block(self.norm3(x, stage_embedding))
-         else:
-             x = self.norm1(
-                 x + self._sa_block(x, tgt_mask, tgt_key_padding_mask),
-                 stage_embedding,
-             )
-             x = self.norm2(
-                 x
-                 + self._mha_block(
-                     x, memory, memory_mask, memory_key_padding_mask
-                 ),
-                 stage_embedding,
-             )
-             x = self.norm3(x + self._ff_block(x), stage_embedding)
-
-         if tgt_is_tuple:
-             return (x, stage_embedding)
-         return x
-
-     # self-attention block
-     def _sa_block(
-         self,
-         x: Tensor,
-         attn_mask: Optional[Tensor],
-         key_padding_mask: Optional[Tensor],
-     ) -> Tensor:
-         x = self.self_attn(
-             x,
-             x,
-             x,
-             attn_mask=attn_mask,
-             key_padding_mask=key_padding_mask,
-             need_weights=False,
-         )[0]
-         return self.dropout1(x)
-
-     # multihead attention block
-     def _mha_block(
-         self,
-         x: Tensor,
-         mem: Tensor,
-         attn_mask: Optional[Tensor],
-         key_padding_mask: Optional[Tensor],
-     ) -> Tensor:
-         x = self.multihead_attn(
-             x,
-             mem,
-             mem,
-             attn_mask=attn_mask,
-             key_padding_mask=key_padding_mask,
-             need_weights=False,
-         )[0]
-         return self.dropout2(x)
-
-     # feed forward block
-     def _ff_block(self, x: Tensor) -> Tensor:
-         x = self.linear2(self.dropout(self.activation(self.linear1(x))))
-         return self.dropout3(x)
-
-
- def _get_clones(module, N):
-     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
-
-
- def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]:
-     if activation == "relu":
-         return F.relu
-     elif activation == "gelu":
-         return F.gelu
-
-     raise RuntimeError(
-         "activation should be relu/gelu, not {}".format(activation)
-     )
 
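Note: the layers above thread an optional stage embedding through every normalization by passing a `(x, stage_embedding)` tuple in place of a plain tensor. A minimal usage sketch of that convention follows; it assumes the deleted file is importable as `modules.transformer` and that `modules.activation` / `modules.scaling` (also removed in this commit) are present, and all shapes and hyperparameters are illustrative only.

import torch
from modules.transformer import (
    AdaptiveLayerNorm,
    LayerNorm,
    TransformerEncoder,
    TransformerEncoderLayer,
)

d_model, nhead = 256, 4
layer = TransformerEncoderLayer(
    d_model,
    nhead,
    dim_feedforward=1024,
    batch_first=True,
    norm_first=True,
    adaptive_layer_norm=True,  # wraps norm1/norm2 in AdaptiveLayerNorm
)
encoder = TransformerEncoder(
    layer,
    num_layers=2,
    norm=AdaptiveLayerNorm(d_model, LayerNorm(d_model)),
)

x = torch.randn(8, 100, d_model)              # (batch, time, d_model)
stage_embedding = torch.randn(8, 1, d_model)  # conditions every AdaptiveLayerNorm
y, _ = encoder((x, stage_embedding))          # tuple in, tuple out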
prompts/promptsf DELETED
File without changes
utils/__init__.py DELETED
@@ -1,15 +0,0 @@
- import torch
- import torch.nn as nn
- # from icefall.utils import make_pad_mask
-
- from .symbol_table import SymbolTable
-
- # make_pad_mask = make_pad_mask
- SymbolTable = SymbolTable
-
-
- class Transpose(nn.Identity):
-     """(N, T, D) -> (N, D, T)"""
-
-     def forward(self, input: torch.Tensor) -> torch.Tensor:
-         return input.transpose(1, 2)
 
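A quick sketch of the `Transpose` helper above (illustrative shapes only): it swaps the time and feature axes so `(N, T, D)` features can feed layers that expect `(N, D, T)`, e.g. `nn.Conv1d`.

import torch
from utils import Transpose

x = torch.randn(4, 100, 80)                   # (N, T, D)
assert Transpose()(x).shape == (4, 80, 100)   # (N, D, T)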
utils/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (915 Bytes)
 
utils/__pycache__/generation.cpython-311.pyc DELETED
Binary file (15.1 kB)
 
utils/__pycache__/prompt_making.cpython-311.pyc DELETED
Binary file (7 kB)
 
utils/__pycache__/sentence_cutter.cpython-311.pyc DELETED
Binary file (3.5 kB)
 
utils/__pycache__/symbol_table.cpython-311.pyc DELETED
Binary file (12.8 kB)
 
utils/download.py DELETED
@@ -1,49 +0,0 @@
- import sys
- import requests
-
-
- def download_file_from_google_drive(id, destination):
-     URL = "https://docs.google.com/uc?export=download&confirm=1"
-
-     session = requests.Session()
-
-     response = session.get(URL, params={"id": id}, stream=True)
-     token = get_confirm_token(response)
-
-     if token:
-         params = {"id": id, "confirm": token}
-         response = session.get(URL, params=params, stream=True)
-
-     save_response_content(response, destination)
-
-
- def get_confirm_token(response):
-     for key, value in response.cookies.items():
-         if key.startswith("download_warning"):
-             return value
-
-     return None
-
-
- def save_response_content(response, destination):
-     CHUNK_SIZE = 32768
-
-     with open(destination, "wb") as f:
-         for chunk in response.iter_content(CHUNK_SIZE):
-             if chunk:  # filter out keep-alive new chunks
-                 f.write(chunk)
-
-
- def main():
-     if len(sys.argv) >= 3:
-         file_id = sys.argv[1]
-         destination = sys.argv[2]
-     else:
-         file_id = "TAKE_ID_FROM_SHAREABLE_LINK"
-         destination = "DESTINATION_FILE_ON_YOUR_DISK"
-     print(f"download {file_id} to {destination}")
-     download_file_from_google_drive(file_id, destination)
-
-
- if __name__ == "__main__":
-     main()
 
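Usage sketch for the helper above; the file id and destination path are placeholders, not real values. The same thing can be done from a shell via `python utils/download.py <file_id> <destination>`.

from utils.download import download_file_from_google_drive

# Both arguments below are illustrative placeholders.
download_file_from_google_drive("GOOGLE_DRIVE_FILE_ID", "checkpoints/model.pt")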
utils/g2p/__init__.py DELETED
@@ -1,72 +0,0 @@
- """ from https://github.com/keithito/tacotron """
- import utils.g2p.cleaners
- from utils.g2p.symbols import symbols
- from tokenizers import Tokenizer
-
- # Mappings from symbol to numeric ID and vice versa:
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
- _id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-
- class PhonemeBpeTokenizer:
-     def __init__(self, tokenizer_path="./utils/g2p/bpe_1024.json"):
-         self.tokenizer = Tokenizer.from_file(tokenizer_path)
-
-     def tokenize(self, text):
-         # 1. convert text to phoneme
-         phonemes, langs = _clean_text(text, ['cje_cleaners'])
-         # 2. replace blank space " " with "_"
-         phonemes = phonemes.replace(" ", "_")
-         # 3. tokenize phonemes
-         phoneme_tokens = self.tokenizer.encode(phonemes).ids
-         assert(len(phoneme_tokens) == len(langs))
-         if not len(phoneme_tokens):
-             raise ValueError("Empty text is given")
-         return phoneme_tokens, langs
-
- def text_to_sequence(text, cleaner_names):
-     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-         text: string to convert to a sequence
-         cleaner_names: names of the cleaner functions to run the text through
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     sequence = []
-     symbol_to_id = {s: i for i, s in enumerate(symbols)}
-     clean_text = _clean_text(text, cleaner_names)
-     for symbol in clean_text:
-         if symbol not in symbol_to_id.keys():
-             continue
-         symbol_id = symbol_to_id[symbol]
-         sequence += [symbol_id]
-     return sequence
-
-
- def cleaned_text_to_sequence(cleaned_text):
-     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-         text: string to convert to a sequence
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id.keys()]
-     return sequence
-
-
- def sequence_to_text(sequence):
-     '''Converts a sequence of IDs back to a string'''
-     result = ''
-     for symbol_id in sequence:
-         s = _id_to_symbol[symbol_id]
-         result += s
-     return result
-
-
- def _clean_text(text, cleaner_names):
-     for name in cleaner_names:
-         cleaner = getattr(utils.g2p.cleaners, name)
-         if not cleaner:
-             raise Exception('Unknown cleaner: %s' % name)
-         text, langs = cleaner(text)
-     return text, langs
 
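A minimal sketch of how the phoneme BPE tokenizer above is used; it assumes the default ./utils/g2p/bpe_1024.json vocabulary and the cje_cleaners pipeline shipped with the repository are on disk.

from utils.g2p import PhonemeBpeTokenizer

tokenizer = PhonemeBpeTokenizer()  # loads ./utils/g2p/bpe_1024.json
phoneme_tokens, langs = tokenizer.tokenize("Hello world")
# phoneme_tokens: BPE ids over the cleaned phoneme string
# langs: one language tag per token, returned by the cleaner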
utils/g2p/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (4.49 kB)
 
utils/g2p/__pycache__/cleaners.cpython-311.pyc DELETED
Binary file (4.66 kB)
 
utils/g2p/__pycache__/english.cpython-311.pyc DELETED
Binary file (8.53 kB)
 
utils/g2p/__pycache__/japanese.cpython-311.pyc DELETED
Binary file (8.34 kB)
 
utils/g2p/__pycache__/mandarin.cpython-311.pyc DELETED
Binary file (9.61 kB)
 
utils/g2p/__pycache__/symbols.cpython-311.pyc DELETED
Binary file (1.5 kB)