Spaces:

yslan
/

3DEnhancer

Running on Zero

App Files Files Community

3DEnhancer / src /diffusion /model /nets /encoder.py

Luo-Yihang

initial code

4c35d22 about 2 months ago

raw

history blame

25.1 kB

	from typing import Optional, Any
	from inspect import isfunction
	import numbers

	import torch
	import torch.nn as nn
	from torch import einsum
	import torch.nn.functional as F
	from einops import rearrange, repeat

	# Optional dependency: record whether xformers could be imported so the rest
	# of the module can choose between xformers-backed and plain attention.
	try:
	    import xformers
	    import xformers.ops
	    XFORMERS_IS_AVAILBLE = True
	except Exception:
	    # Catch Exception rather than using a bare ``except`` so that
	    # KeyboardInterrupt and SystemExit still propagate; any ordinary
	    # import-time failure keeps the best-effort fallback.
	    XFORMERS_IS_AVAILBLE = False
	    print("No module 'xformers'. Proceeding without it.")


	class Downsample(nn.Module):
	    """Halve the spatial size of a feature map.

	    With ``with_conv`` set, the input is padded with one zero column on the
	    right and one zero row at the bottom and then passed through a stride-2
	    3x3 convolution; otherwise 2x2 average pooling with stride 2 is used.
	    """

	    def __init__(self, in_channels, with_conv):
	        super().__init__()
	        self.with_conv = with_conv
	        if not with_conv:
	            return
	        # torch convs only pad symmetrically, so forward() pads by hand and
	        # the convolution itself runs without padding.
	        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

	    def forward(self, x):
	        if not self.with_conv:
	            return F.avg_pool2d(x, kernel_size=2, stride=2)
	        # (left, right, top, bottom) padding of the last two dimensions
	        padded = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
	        return self.conv(padded)


	def nonlinearity(x):
	    """Swish activation: ``x * sigmoid(x)``, applied element-wise."""
	    gate = torch.sigmoid(x)
	    return x * gate


	def Normalize(in_channels, num_groups=32):
	    """Build an affine ``GroupNorm`` (eps=1e-6) over ``in_channels`` channels."""
	    return nn.GroupNorm(num_groups, in_channels, eps=1e-6, affine=True)


	class AttnBlock(nn.Module):
	    """Single-head self-attention across the spatial positions of a feature
	    map, added back onto the input as a residual."""

	    def __init__(self, in_channels):
	        super().__init__()
	        self.in_channels = in_channels

	        def pointwise():
	            # 1x1 convolution that keeps the channel count
	            return nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)

	        self.norm = Normalize(in_channels)
	        self.q = pointwise()
	        self.k = pointwise()
	        self.v = pointwise()
	        self.proj_out = pointwise()

	    def forward(self, x):
	        normed = self.norm(x)
	        query = self.q(normed)
	        key = self.k(normed)
	        value = self.v(normed)

	        batch, channels, height, width = query.shape
	        n_pos = height * width
	        query = query.reshape(batch, channels, n_pos).permute(0, 2, 1)  # b,hw,c
	        key = key.reshape(batch, channels, n_pos)  # b,c,hw
	        value = value.reshape(batch, channels, n_pos)  # b,c,hw

	        # scores[b,i,j] = sum_c query[b,i,c] * key[b,c,j], scaled by c**-0.5
	        scores = torch.bmm(query, key)
	        scores = scores * (int(channels) ** (-0.5))
	        weights = F.softmax(scores, dim=2)

	        # attended[b,c,i] = sum_j value[b,c,j] * weights[b,i,j]
	        attended = torch.bmm(value, weights.permute(0, 2, 1))
	        attended = attended.reshape(batch, channels, height, width)

	        return x + self.proj_out(attended)


	class MemoryEfficientAttnBlock(nn.Module):
	    """
	    Spatial self-attention block backed by xformers' memory-efficient kernel,
	    see https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
	    Note: this is a single-head self-attention operation
	    """

	    def __init__(self, in_channels):
	        super().__init__()
	        self.in_channels = in_channels

	        def pointwise():
	            # 1x1 convolution that keeps the channel count
	            return nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)

	        self.norm = Normalize(in_channels)
	        self.q = pointwise()
	        self.k = pointwise()
	        self.v = pointwise()
	        self.proj_out = pointwise()
	        # forwarded as ``op`` to xformers; None leaves the choice to xformers
	        self.attention_op: Optional[Any] = None

	    def forward(self, x):
	        normed = self.norm(x)
	        query = self.q(normed)
	        key = self.k(normed)
	        value = self.v(normed)

	        B, C, H, W = query.shape

	        def as_tokens(t):
	            # (b, c, h, w) -> contiguous (b, h*w, c). There is only one head,
	            # so splitting into heads leaves the layout unchanged.
	            tokens = rearrange(t, 'b c h w -> b (h w) c')
	            return tokens.reshape(B, tokens.shape[1], C).contiguous()

	        query, key, value = as_tokens(query), as_tokens(key), as_tokens(value)
	        attended = xformers.ops.memory_efficient_attention(
	            query, key, value, attn_bias=None, op=self.attention_op
	        )

	        # merging the single head back is likewise a plain reshape
	        attended = attended.reshape(B, attended.shape[1], C)
	        attended = rearrange(attended, 'b (h w) c -> b c h w', b=B, h=H, w=W, c=C)
	        return x + self.proj_out(attended)


	def exists(val):
	    """Return True for every value except ``None``."""
	    return not (val is None)


	def zero_module(module):
	    """
	    Set every parameter of ``module`` to zero in place and hand the same
	    module back.
	    """
	    with torch.no_grad():
	        # in-place update that autograd does not record
	        for param in module.parameters():
	            param.zero_()
	    return module


	def default(val, d):
	    """Return ``val`` unless it is ``None``; otherwise fall back to ``d``.

	    When ``d`` is a plain Python function it is called and its result is used
	    as the fallback.
	    """
	    if not exists(val):
	        return d() if isfunction(d) else d
	    return val


	class RMSNorm(nn.Module):
	    """Root-mean-square normalisation over the last dimension, with an
	    optional learnable per-element scale."""

	    def __init__(self, dim, eps: float, elementwise_affine: bool = True):
	        super().__init__()
	        self.eps = eps

	        # a bare integer is treated as a one-dimensional shape
	        shape = (dim,) if isinstance(dim, numbers.Integral) else dim
	        self.dim = torch.Size(shape)

	        self.weight = nn.Parameter(torch.ones(shape)) if elementwise_affine else None

	    def forward(self, hidden_states):
	        input_dtype = hidden_states.dtype
	        # the mean of squares is computed in float32
	        mean_sq = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
	        normed = hidden_states * torch.rsqrt(mean_sq + self.eps)

	        scale = self.weight
	        if scale is None:
	            return normed.to(input_dtype)

	        if scale.dtype in (torch.float16, torch.bfloat16):
	            # match a half-precision weight before scaling
	            normed = normed.to(scale.dtype)
	        return (normed * scale).to(input_dtype)


	class GEGLU(nn.Module):
	    """Linear projection to ``2 * dim_out`` features; the second half, passed
	    through GELU, gates the first half."""

	    def __init__(self, dim_in, dim_out):
	        super().__init__()
	        self.proj = nn.Linear(dim_in, 2 * dim_out)

	    def forward(self, x):
	        projected = self.proj(x)
	        value, gate = projected.chunk(2, dim=-1)
	        return value * F.gelu(gate)


	class FeedForward(nn.Module):
	    """Two-layer feed-forward network: input projection (Linear + GELU, or
	    GEGLU when ``glu`` is set), dropout, then a linear output projection."""

	    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
	        super().__init__()
	        inner_dim = int(dim * mult)
	        # the output width falls back to the input width
	        dim_out = default(dim_out, dim)

	        if glu:
	            project_in = GEGLU(dim, inner_dim)
	        else:
	            project_in = nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())

	        self.net = nn.Sequential(
	            project_in,
	            nn.Dropout(dropout),
	            nn.Linear(inner_dim, dim_out),
	        )

	    def forward(self, x):
	        return self.net(x)


	class CrossAttention(nn.Module):
	    """Multi-head softmax attention from ``x`` onto ``context``; with no
	    context given, ``x`` attends to itself."""

	    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
	        super().__init__()
	        inner_dim = dim_head * heads
	        context_dim = default(context_dim, query_dim)

	        self.scale = dim_head ** -0.5
	        self.heads = heads

	        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
	        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
	        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

	        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))

	    def forward(self, x, context=None, mask=None):
	        n_heads = self.heads

	        query = self.to_q(x)
	        context = default(context, x)
	        key = self.to_k(context)
	        value = self.to_v(context)

	        def split_heads(t):
	            # fold the head axis into the batch axis
	            return rearrange(t, 'b n (h d) -> (b h) n d', h=n_heads)

	        query, key, value = split_heads(query), split_heads(key), split_heads(value)

	        scores = einsum('b i d, b j d -> b i j', query, key) * self.scale

	        if exists(mask):
	            # flatten the mask per batch item, repeat it for every head and
	            # push the positions where it is False to the most negative value
	            mask = rearrange(mask, 'b ... -> b (...)')
	            fill_value = -torch.finfo(scores.dtype).max
	            mask = repeat(mask, 'b j -> (b h) () j', h=n_heads)
	            scores.masked_fill_(~mask, fill_value)

	        weights = scores.softmax(dim=-1)

	        attended = einsum('b i j, b j d -> b i d', weights, value)
	        attended = rearrange(attended, '(b h) n d -> b n (h d)', h=n_heads)
	        return self.to_out(attended)


	class MemoryEfficientCrossAttention(nn.Module):
	    """Multi-head attention from ``x`` onto ``context`` computed with xformers.

	    Adapted from
	    https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223

	    Args:
	        query_dim: feature size of ``x``.
	        context_dim: feature size of ``context``; defaults to ``query_dim``.
	        heads: number of attention heads.
	        dim_head: feature size of each head.
	        dropout: dropout probability applied after the output projection.
	        enable_rmsnorm: accepted for interface compatibility; not used here.
	        qk_norm: when true, queries and keys are RMS-normalised per head
	            before attention.
	    """

	    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, enable_rmsnorm=False, qk_norm=False):
	        super().__init__()
	        inner_dim = dim_head * heads
	        context_dim = default(context_dim, query_dim)

	        self.heads = heads
	        self.dim_head = dim_head

	        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
	        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)

	        self.q_norm = RMSNorm(self.dim_head, elementwise_affine=True, eps=1e-5) if qk_norm else nn.Identity()
	        self.k_norm = RMSNorm(self.dim_head, elementwise_affine=True, eps=1e-5) if qk_norm else nn.Identity()

	        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

	        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
	        # forwarded as ``op`` to xformers; None leaves the choice to xformers
	        self.attention_op: Optional[Any] = None

	    def forward(self, x, context=None, mask=None):
	        """Attend from ``x`` to ``context`` (or to ``x`` itself when no context is given).

	        Raises:
	            NotImplementedError: if ``mask`` is given; masking is unsupported.
	        """
	        if exists(mask):
	            # Reject the unsupported argument up front instead of after the
	            # attention has already been computed.
	            raise NotImplementedError

	        q = self.to_q(x)
	        context = default(context, x)
	        k = self.to_k(context)
	        v = self.to_v(context)

	        b = q.shape[0]

	        def split_heads(t):
	            # (b, n, heads * dim_head) -> contiguous (b * heads, n, dim_head)
	            return (
	                t.reshape(b, t.shape[1], self.heads, self.dim_head)
	                .permute(0, 2, 1, 3)
	                .reshape(b * self.heads, t.shape[1], self.dim_head)
	                .contiguous()
	            )

	        q, k, v = split_heads(q), split_heads(k), split_heads(v)
	        q, k = self.q_norm(q), self.k_norm(k)  # for stable amp training

	        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op)

	        # (b * heads, n, dim_head) -> (b, n, heads * dim_head)
	        out = (
	            out.reshape(b, self.heads, out.shape[1], self.dim_head)
	            .permute(0, 2, 1, 3)
	            .reshape(b, out.shape[1], self.heads * self.dim_head)
	        )
	        return self.to_out(out)


	class BasicTransformerBlock(nn.Module):
	    """Pre-norm transformer block: attention (``attn1``), attention onto the
	    context (``attn2``) and a feed-forward layer, each wrapped in a residual
	    connection."""

	    ATTENTION_MODES = {
	        "softmax": CrossAttention,  # vanilla attention
	        "softmax-xformers": MemoryEfficientCrossAttention,
	    }

	    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
	                 disable_self_attn=False):
	        super().__init__()
	        attn_mode = "softmax"
	        if XFORMERS_IS_AVAILBLE:
	            attn_mode = "softmax-xformers"
	        assert attn_mode in self.ATTENTION_MODES
	        attn_cls = self.ATTENTION_MODES[attn_mode]

	        self.disable_self_attn = disable_self_attn
	        # attn1 is a self-attention unless disable_self_attn is set, in which
	        # case it is sized for the context as well
	        first_context_dim = context_dim if self.disable_self_attn else None
	        self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
	                              context_dim=first_context_dim)
	        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
	        # attn2 degenerates to self-attention when no context is supplied
	        self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim,
	                              heads=n_heads, dim_head=d_head, dropout=dropout)
	        self.norm1 = nn.LayerNorm(dim)
	        self.norm2 = nn.LayerNorm(dim)
	        self.norm3 = nn.LayerNorm(dim)
	        # stored only; forward() calls _forward directly
	        self.checkpoint = checkpoint

	    def forward(self, x, context=None):
	        return self._forward(x, context)

	    def _forward(self, x, context=None):
	        first_context = context if self.disable_self_attn else None
	        attended = self.attn1(self.norm1(x), context=first_context)
	        x = attended + x
	        attended = self.attn2(self.norm2(x), context=context)
	        x = attended + x
	        fed = self.ff(self.norm3(x))
	        return fed + x


	class BasicTransformerBlock3D(BasicTransformerBlock):
	    """Transformer block whose first attention runs over the tokens of all
	    ``num_frames`` frames of a sample jointly; the second attention and the
	    feed-forward layer run per frame."""

	    def forward(self, x, context=None, num_frames=1):
	        return self._forward(x, context, num_frames)

	    def _forward(self, x, context=None, num_frames=1):
	        first_context = context if self.disable_self_attn else None

	        # concatenate the token sequences of the frames belonging to one sample
	        joint = rearrange(x, "(b f) l c -> b (f l) c", f=num_frames).contiguous()
	        joint = self.attn1(self.norm1(joint), context=first_context) + joint

	        # back to one sequence per frame
	        per_frame = rearrange(joint, "b (f l) c -> (b f) l c", f=num_frames).contiguous()
	        per_frame = self.attn2(self.norm2(per_frame), context=context) + per_frame
	        return self.ff(self.norm3(per_frame)) + per_frame


	class SpatialTransformer3D(nn.Module):
	    """Transformer over image-like feature maps built from 3D attention blocks.

	    The input is group-normalised, projected to ``n_heads * d_head`` channels,
	    flattened to a token sequence, passed through ``depth``
	    ``BasicTransformerBlock3D`` blocks, projected back to ``in_channels`` and
	    added to the input. The output projection is zero-initialised.

	    Args:
	        in_channels: channel count of the input feature map.
	        n_heads: number of attention heads.
	        d_head: feature size of each head.
	        depth: number of transformer blocks.
	        dropout: dropout probability passed to the blocks.
	        context_dim: context feature size; either a list with one entry per
	            block or a single value shared by all blocks.
	        disable_self_attn: passed to the blocks.
	        use_linear: use linear layers instead of 1x1 convolutions for the
	            input and output projections.
	        use_checkpoint: passed to the blocks as ``checkpoint``.
	    """

	    def __init__(self, in_channels, n_heads, d_head,
	                 depth=1, dropout=0., context_dim=None,
	                 disable_self_attn=False, use_linear=False,
	                 use_checkpoint=True):
	        super().__init__()
	        if not isinstance(context_dim, list):
	            # A single value (or None) applies to every block; a one-element
	            # list here would make context_dim[d] fail for depth > 1.
	            context_dim = [context_dim] * depth

	        self.in_channels = in_channels
	        inner_dim = n_heads * d_head
	        self.norm = Normalize(in_channels)
	        if not use_linear:
	            self.proj_in = nn.Conv2d(in_channels,
	                                     inner_dim,
	                                     kernel_size=1,
	                                     stride=1,
	                                     padding=0)
	        else:
	            self.proj_in = nn.Linear(in_channels, inner_dim)

	        self.transformer_blocks = nn.ModuleList(
	            [BasicTransformerBlock3D(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
	                                     disable_self_attn=disable_self_attn, checkpoint=use_checkpoint)
	             for d in range(depth)]
	        )
	        if not use_linear:
	            self.proj_out = zero_module(nn.Conv2d(inner_dim,
	                                                  in_channels,
	                                                  kernel_size=1,
	                                                  stride=1,
	                                                  padding=0))
	        else:
	            # Maps inner_dim back to in_channels. The reversed argument order
	            # only worked when both sizes happened to be equal, in which case
	            # the parameter shapes are unchanged.
	            self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
	        self.use_linear = use_linear

	    def forward(self, x, context=None, num_frames=1):
	        """Apply the transformer to ``x`` of shape (b, c, h, w) and return a
	        tensor of the same shape.

	        ``context`` may be a list with one entry per block or a single value
	        shared by all blocks; ``num_frames`` is forwarded to every block.
	        """
	        # note: if no context is given, cross-attention defaults to self-attention
	        if not isinstance(context, list):
	            context = [context]
	        b, c, h, w = x.shape
	        x_in = x
	        x = self.norm(x)
	        if not self.use_linear:
	            x = self.proj_in(x)
	        x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
	        if self.use_linear:
	            x = self.proj_in(x)
	        for i, block in enumerate(self.transformer_blocks):
	            # a lone context is reused by every block
	            block_context = context[0] if len(context) == 1 else context[i]
	            x = block(x, context=block_context, num_frames=num_frames)
	        if self.use_linear:
	            x = self.proj_out(x)
	        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
	        if not self.use_linear:
	            x = self.proj_out(x)
	        return x + x_in


	def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
	    """Build the attention module selected by ``attn_type``.

	    Args:
	        in_channels: channel count of the feature map the module will see.
	        attn_type: one of "vanilla", "vanilla-xformers", "mv-vanilla" or
	            "none". "vanilla" is upgraded to "vanilla-xformers" when xformers
	            was imported. "memory-efficient-cross-attn" and "linear" are
	            accepted names but raise ``NotImplementedError``.
	        attn_kwargs: keyword arguments for ``SpatialTransformer3D``; required
	            for "mv-vanilla" and must be empty or ``None`` for "vanilla".

	    Raises:
	        AssertionError: on an unknown ``attn_type`` or unsuitable ``attn_kwargs``.
	        NotImplementedError: for accepted but unimplemented attention types.
	    """
	    assert attn_type in ["vanilla", "vanilla-xformers", "memory-efficient-cross-attn", "linear", "none", "mv-vanilla"], f'attn_type {attn_type} unknown'
	    if XFORMERS_IS_AVAILBLE and attn_type == "vanilla":
	        attn_type = "vanilla-xformers"
	    if attn_type == "vanilla":
	        # An empty mapping carries no options, so it is accepted just like
	        # None; rejecting it made a caller's empty default crash here.
	        assert not attn_kwargs
	        return AttnBlock(in_channels)
	    elif attn_type == "mv-vanilla":
	        assert attn_kwargs is not None
	        return SpatialTransformer3D(in_channels, **attn_kwargs)
	    elif attn_type == "vanilla-xformers":
	        print(f"building MemoryEfficientAttnBlock with {in_channels} in_channels...")
	        return MemoryEfficientAttnBlock(in_channels)
	    elif attn_type == "none":
	        return nn.Identity(in_channels)
	    else:
	        raise NotImplementedError()


	class ResnetBlock(nn.Module):
	    """Residual block of two (GroupNorm, swish, 3x3 conv) stages with an
	    optional additive timestep-embedding projection after the first conv.

	    When the channel count changes, the skip path goes through a 3x3
	    (``conv_shortcut``) or 1x1 (``nin_shortcut``) convolution.
	    """

	    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
	                 dropout, temb_channels=0):
	        super().__init__()
	        if out_channels is None:
	            out_channels = in_channels
	        self.in_channels = in_channels
	        self.out_channels = out_channels
	        self.use_conv_shortcut = conv_shortcut

	        self.norm1 = Normalize(in_channels)
	        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
	        if temb_channels > 0:
	            self.temb_proj = nn.Linear(temb_channels, out_channels)
	        self.norm2 = Normalize(out_channels)
	        self.dropout = nn.Dropout(dropout)
	        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

	        if self.in_channels == self.out_channels:
	            # identity skip connection, nothing more to build
	            return
	        if self.use_conv_shortcut:
	            self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
	        else:
	            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

	    def forward(self, x, temb=None):
	        h = self.conv1(nonlinearity(self.norm1(x)))

	        if temb is not None:
	            # broadcast the projected embedding over the two spatial axes
	            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]

	        h = self.conv2(self.dropout(nonlinearity(self.norm2(h))))

	        if self.in_channels != self.out_channels:
	            shortcut = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
	            x = shortcut(x)

	        return x + h


	class Encoder(nn.Module):
	    """Convolutional encoder from images to latent feature maps.

	    Structure: an input 3x3 conv, ``len(ch_mult)`` levels of
	    ``num_res_blocks`` ResnetBlocks (each followed by attention when the
	    current resolution is listed in ``attn_resolutions``) with a Downsample
	    between levels, a ResnetBlock / attention / ResnetBlock middle section,
	    then GroupNorm, swish and an output 3x3 conv.

	    Args:
	        ch: base channel count.
	        out_ch: accepted but not used by the encoder.
	        ch_mult: per-level channel multipliers applied to ``ch``.
	        num_res_blocks: ResnetBlocks per level.
	        attn_resolutions: resolutions at which down-level attention is added.
	        dropout: dropout probability for the ResnetBlocks.
	        resamp_with_conv: use a strided conv (rather than pooling) to downsample.
	        in_channels: channel count of the input.
	        resolution: input resolution, halved after every level but the last.
	        z_channels: latent channel count.
	        double_z: emit ``2 * z_channels`` output channels instead of ``z_channels``.
	        use_linear_attn: forces ``attn_type`` to "linear".
	        attn_type: attention type handed to ``make_attn``.
	        attn_kwargs: extra arguments handed to ``make_attn``; ``None`` (the
	            default) is what ``make_attn`` expects for "vanilla" attention.
	        z_downsample_size: stride of the output convolution.
	        **ignore_kwargs: swallowed without effect.
	    """

	    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
	                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
	                 resolution,
	                 z_channels, double_z=True,
	                 use_linear_attn=False, attn_type="vanilla",
	                 attn_kwargs=None,
	                 z_downsample_size=1,
	                 **ignore_kwargs):
	        super().__init__()
	        # attn_kwargs defaults to None rather than a dict literal so that no
	        # mutable object is shared between instances.
	        if use_linear_attn: attn_type = "linear"
	        self.ch = ch
	        self.temb_ch = 0
	        self.num_resolutions = len(ch_mult)
	        self.num_res_blocks = num_res_blocks
	        self.resolution = resolution
	        self.in_channels = in_channels

	        # downsampling
	        self.conv_in = torch.nn.Conv2d(in_channels,
	                                       self.ch,
	                                       kernel_size=3,
	                                       stride=1,
	                                       padding=1)

	        curr_res = resolution
	        in_ch_mult = (1,)+tuple(ch_mult)
	        self.in_ch_mult = in_ch_mult
	        self.down = nn.ModuleList()
	        for i_level in range(self.num_resolutions):
	            block = nn.ModuleList()
	            attn = nn.ModuleList()
	            block_in = ch*in_ch_mult[i_level]
	            block_out = ch*ch_mult[i_level]
	            for i_block in range(self.num_res_blocks):
	                block.append(ResnetBlock(in_channels=block_in,
	                                         out_channels=block_out,
	                                         temb_channels=self.temb_ch,
	                                         dropout=dropout))
	                block_in = block_out
	                if curr_res in attn_resolutions:
	                    attn.append(make_attn(block_in, attn_type=attn_type, attn_kwargs=attn_kwargs))
	            down = nn.Module()
	            down.block = block
	            down.attn = attn
	            if i_level != self.num_resolutions-1:
	                down.downsample = Downsample(block_in, resamp_with_conv)
	                curr_res = curr_res // 2
	            self.down.append(down)

	        # middle
	        self.mid = nn.Module()
	        self.mid.block_1 = ResnetBlock(in_channels=block_in,
	                                       out_channels=block_in,
	                                       temb_channels=self.temb_ch,
	                                       dropout=dropout)
	        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type, attn_kwargs=attn_kwargs)
	        self.mid.block_2 = ResnetBlock(in_channels=block_in,
	                                       out_channels=block_in,
	                                       temb_channels=self.temb_ch,
	                                       dropout=dropout)

	        # end
	        self.norm_out = Normalize(block_in)
	        self.conv_out = torch.nn.Conv2d(block_in,
	                                        2*z_channels if double_z else z_channels,
	                                        kernel_size=3,
	                                        stride=z_downsample_size,
	                                        padding=1)

	    def forward(self, x, **kwargs):
	        """Encode ``x``; ``kwargs`` are forwarded to the middle attention only."""
	        # timestep embedding (the encoder uses none)
	        temb = None

	        # downsampling
	        h = self.conv_in(x)
	        for i_level in range(self.num_resolutions):
	            for i_block in range(self.num_res_blocks):
	                h = self.down[i_level].block[i_block](h, temb)
	                if len(self.down[i_level].attn) > 0:
	                    # NOTE(review): down-level attention is called without
	                    # kwargs, unlike mid.attn_1 below - confirm this is intended.
	                    h = self.down[i_level].attn[i_block](h)
	            if i_level != self.num_resolutions-1:
	                h = self.down[i_level].downsample(h)

	        # middle
	        h = self.mid.block_1(h, temb)
	        h = self.mid.attn_1(h, **kwargs)
	        h = self.mid.block_2(h, temb)

	        # end
	        h = self.norm_out(h)
	        h = nonlinearity(h)
	        h = self.conv_out(h)
	        return h


	class MVEncoder(Encoder):
	    """Encoder variant that defaults to "mv-vanilla" attention and forwards
	    the number of views to the middle attention as ``num_frames``."""

	    # The keyword-only marker ``*`` and the ``**`` on ignore_kwargs are
	    # required: without them the signature does not parse, and the
	    # ``**ignore_kwargs`` expansion below needs a mapping.
	    def __init__(self, *, ch, out_ch, ch_mult=(1, 2, 4, 8), num_res_blocks, attn_resolutions, dropout=0, resamp_with_conv=True, in_channels, resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="mv-vanilla", z_downsample_size=1, **ignore_kwargs):
	        super().__init__(ch=ch, out_ch=out_ch, ch_mult=ch_mult, num_res_blocks=num_res_blocks, attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv, in_channels=in_channels, resolution=resolution, z_channels=z_channels, double_z=double_z, use_linear_attn=use_linear_attn, attn_type=attn_type,
	                         z_downsample_size=z_downsample_size,
	                         add_fusion_layer=False,
	                         **ignore_kwargs)

	    def forward(self, x, n_views):
	        return super().forward(x, num_frames=n_views)