# AudioX/stable_audio_tools/models/local_attention.py
import torch
from einops import rearrange
from torch import nn
from .blocks import AdaRMSNorm
from .transformer import Attention, FeedForward, RotaryEmbedding, LayerNorm
def checkpoint(function, *args, **kwargs):
    # Gradient checkpointing wrapper: default to the non-reentrant implementation.
    kwargs.setdefault("use_reentrant", False)
    return torch.utils.checkpoint.checkpoint(function, *args, **kwargs)
# Adapted from https://github.com/lucidrains/local-attention/blob/master/local_attention/transformer.py
class ContinuousLocalTransformer(nn.Module):
    """Stack of local (windowed) self-attention blocks over continuous features, with
    optional adaptive conditioning (AdaRMSNorm) and cross-attention conditioning."""
def __init__(
self,
*,
dim,
depth,
dim_in = None,
dim_out = None,
causal = False,
local_attn_window_size = 64,
heads = 8,
ff_mult = 2,
cond_dim = 0,
cross_attn_cond_dim = 0,
**kwargs
):
super().__init__()
dim_head = dim//heads
self.layers = nn.ModuleList([])
self.project_in = nn.Linear(dim_in, dim) if dim_in is not None else nn.Identity()
self.project_out = nn.Linear(dim, dim_out) if dim_out is not None else nn.Identity()
self.local_attn_window_size = local_attn_window_size
self.cond_dim = cond_dim
self.cross_attn_cond_dim = cross_attn_cond_dim
self.rotary_pos_emb = RotaryEmbedding(max(dim_head // 2, 32))
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                # Pre-attention norm: adaptive RMSNorm when a conditioning signal is provided
                AdaRMSNorm(dim, cond_dim, eps=1e-8) if cond_dim > 0 else LayerNorm(dim),
                # Local (windowed) self-attention
                Attention(
                    dim=dim,
                    dim_heads=dim_head,
                    causal=causal,
                    zero_init_output=True,
                    natten_kernel_size=local_attn_window_size,
                ),
                # Optional cross-attention to an external conditioning sequence
                Attention(
                    dim=dim,
                    dim_heads=dim_head,
                    dim_context=cross_attn_cond_dim,
                    zero_init_output=True
                ) if self.cross_attn_cond_dim > 0 else nn.Identity(),
                # Pre-feedforward norm
                AdaRMSNorm(dim, cond_dim, eps=1e-8) if cond_dim > 0 else LayerNorm(dim),
                FeedForward(dim=dim, mult=ff_mult, no_bias=True)
            ]))
    def forward(self, x, mask = None, cond = None, cross_attn_cond = None, cross_attn_cond_mask = None, prepend_cond = None):

        x = checkpoint(self.project_in, x)

        # Prepend conditioning tokens along the sequence dimension, if provided
        if prepend_cond is not None:
            x = torch.cat([prepend_cond, x], dim=1)

        pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1])

        for attn_norm, attn, xattn, ff_norm, ff in self.layers:

            residual = x

            if cond is not None:
                x = checkpoint(attn_norm, x, cond)
            else:
                x = checkpoint(attn_norm, x)

            # Local self-attention with rotary position embeddings, plus residual
            x = checkpoint(attn, x, mask = mask, rotary_pos_emb=pos_emb) + residual

            # Optional cross-attention to the conditioning sequence, plus residual
            if cross_attn_cond is not None:
                x = checkpoint(xattn, x, context=cross_attn_cond, context_mask=cross_attn_cond_mask) + x

            residual = x

            if cond is not None:
                x = checkpoint(ff_norm, x, cond)
            else:
                x = checkpoint(ff_norm, x)

            # Feedforward, plus residual
            x = checkpoint(ff, x) + residual

        return checkpoint(self.project_out, x)
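
# Illustrative usage sketch (added for documentation, not part of the original module):
# pushes a random (batch, seq_len, dim) tensor through a small ContinuousLocalTransformer.
# The sizes below are arbitrary, and running this assumes the sibling Attention/AdaRMSNorm
# modules (and any local-attention backend they rely on) are importable.
def _example_continuous_local_transformer():
    model = ContinuousLocalTransformer(dim=128, depth=2, heads=4)
    x = torch.randn(2, 256, 128)  # (batch, seq_len, dim)
    y = model(x)                  # output keeps the same shape: (2, 256, 128)
    return y
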
class TransformerDownsampleBlock1D(nn.Module):
    """Local-attention transformer block that shortens the sequence by `downsample_ratio`,
    folding adjacent timesteps into channels and projecting back to `embed_dim`."""
def __init__(
self,
in_channels,
embed_dim = 768,
depth = 3,
heads = 12,
downsample_ratio = 2,
local_attn_window_size = 64,
**kwargs
):
super().__init__()
self.downsample_ratio = downsample_ratio
self.transformer = ContinuousLocalTransformer(
dim=embed_dim,
depth=depth,
heads=heads,
local_attn_window_size=local_attn_window_size,
**kwargs
)
self.project_in = nn.Linear(in_channels, embed_dim, bias=False) if in_channels != embed_dim else nn.Identity()
self.project_down = nn.Linear(embed_dim * self.downsample_ratio, embed_dim, bias=False)
def forward(self, x):
x = checkpoint(self.project_in, x)
# Compute
x = self.transformer(x)
# Trade sequence length for channels
x = rearrange(x, "b (n r) c -> b n (c r)", r=self.downsample_ratio)
# Project back to embed dim
x = checkpoint(self.project_down, x)
return x
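
# Illustrative usage sketch (not part of the original module; sizes are arbitrary):
# with the default downsample_ratio=2 the sequence length is halved and the features
# are projected to embed_dim.
def _example_downsample_block():
    block = TransformerDownsampleBlock1D(in_channels=64, embed_dim=128, depth=1, heads=4)
    x = torch.randn(2, 32, 64)    # (batch, seq_len, in_channels)
    y = block(x)                  # (2, 16, 128): sequence halved, width projected to embed_dim
    return y
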
class TransformerUpsampleBlock1D(nn.Module):
    """Local-attention transformer block that lengthens the sequence by `upsample_ratio`,
    unfolding channels into adjacent timesteps after an up-projection."""
def __init__(
self,
in_channels,
embed_dim,
depth = 3,
heads = 12,
upsample_ratio = 2,
local_attn_window_size = 64,
**kwargs
):
super().__init__()
self.upsample_ratio = upsample_ratio
self.transformer = ContinuousLocalTransformer(
dim=embed_dim,
depth=depth,
heads=heads,
local_attn_window_size = local_attn_window_size,
**kwargs
)
self.project_in = nn.Linear(in_channels, embed_dim, bias=False) if in_channels != embed_dim else nn.Identity()
self.project_up = nn.Linear(embed_dim, embed_dim * self.upsample_ratio, bias=False)
def forward(self, x):
# Project to embed dim
x = checkpoint(self.project_in, x)
# Project to increase channel dim
x = checkpoint(self.project_up, x)
# Trade channels for sequence length
x = rearrange(x, "b n (c r) -> b (n r) c", r=self.upsample_ratio)
# Compute
x = self.transformer(x)
return x
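
# Illustrative usage sketch (not part of the original module; sizes are arbitrary):
# with the default upsample_ratio=2 the sequence length is doubled at constant embed_dim.
def _example_upsample_block():
    block = TransformerUpsampleBlock1D(in_channels=128, embed_dim=128, depth=1, heads=4)
    x = torch.randn(2, 16, 128)   # (batch, seq_len, in_channels)
    y = block(x)                  # (2, 32, 128): sequence doubled
    return y
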
class TransformerEncoder1D(nn.Module):
    """Channels-first 1D encoder: a stack of TransformerDownsampleBlock1D stages that reduces
    the sequence length by the product of `ratios` while widening the embedding dimension."""
def __init__(
self,
in_channels,
out_channels,
embed_dims = [96, 192, 384, 768],
heads = [12, 12, 12, 12],
depths = [3, 3, 3, 3],
ratios = [2, 2, 2, 2],
local_attn_window_size = 64,
**kwargs
):
super().__init__()
layers = []
for layer in range(len(depths)):
prev_dim = embed_dims[layer - 1] if layer > 0 else embed_dims[0]
layers.append(
TransformerDownsampleBlock1D(
in_channels = prev_dim,
embed_dim = embed_dims[layer],
heads = heads[layer],
depth = depths[layer],
downsample_ratio = ratios[layer],
local_attn_window_size = local_attn_window_size,
**kwargs
)
)
self.layers = nn.Sequential(*layers)
self.project_in = nn.Linear(in_channels, embed_dims[0], bias=False)
self.project_out = nn.Linear(embed_dims[-1], out_channels, bias=False)
    def forward(self, x):
        # Input is channels-first: (batch, in_channels, seq_len)
        x = rearrange(x, "b c n -> b n c")
        x = checkpoint(self.project_in, x)
        x = self.layers(x)
        x = checkpoint(self.project_out, x)
        # Back to channels-first: (batch, out_channels, seq_len / prod(ratios))
        x = rearrange(x, "b n c -> b c n")
        return x
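
# Illustrative usage sketch (not part of the original module; sizes are arbitrary):
# with the default embed_dims/ratios the encoder reduces the sequence length by
# prod(ratios) = 16 and maps channels from in_channels to out_channels.
def _example_encoder():
    enc = TransformerEncoder1D(in_channels=2, out_channels=32)
    x = torch.randn(1, 2, 1024)   # (batch, channels, seq_len), channels-first
    z = enc(x)                    # (1, 32, 64)
    return z
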
class TransformerDecoder1D(nn.Module):
    """Channels-first 1D decoder: a stack of TransformerUpsampleBlock1D stages that expands
    the sequence length by the product of `ratios` while narrowing the embedding dimension."""
def __init__(
self,
in_channels,
out_channels,
embed_dims = [768, 384, 192, 96],
heads = [12, 12, 12, 12],
depths = [3, 3, 3, 3],
ratios = [2, 2, 2, 2],
local_attn_window_size = 64,
**kwargs
):
super().__init__()
layers = []
for layer in range(len(depths)):
prev_dim = embed_dims[layer - 1] if layer > 0 else embed_dims[0]
layers.append(
TransformerUpsampleBlock1D(
in_channels = prev_dim,
embed_dim = embed_dims[layer],
heads = heads[layer],
depth = depths[layer],
upsample_ratio = ratios[layer],
local_attn_window_size = local_attn_window_size,
**kwargs
)
)
self.layers = nn.Sequential(*layers)
self.project_in = nn.Linear(in_channels, embed_dims[0], bias=False)
self.project_out = nn.Linear(embed_dims[-1], out_channels, bias=False)
    def forward(self, x):
        # Input is channels-first: (batch, in_channels, seq_len)
        x = rearrange(x, "b c n -> b n c")
        x = checkpoint(self.project_in, x)
        x = self.layers(x)
        x = checkpoint(self.project_out, x)
        # Back to channels-first: (batch, out_channels, seq_len * prod(ratios))
        x = rearrange(x, "b n c -> b c n")
        return x
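
# Illustrative usage sketch (not part of the original module; sizes are arbitrary):
# with the default embed_dims/ratios the decoder expands the sequence length by
# prod(ratios) = 16, e.g. mapping an encoder output back to the original resolution.
def _example_decoder():
    dec = TransformerDecoder1D(in_channels=32, out_channels=2)
    z = torch.randn(1, 32, 64)    # (batch, channels, seq_len), e.g. an encoder output
    x = dec(z)                    # (1, 2, 1024)
    return x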