# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
import gguf
import torch
import comfy.ops
import comfy.model_management
from .dequant import dequantize_tensor, is_quantized
# to avoid breaking really old pytorch versions
if hasattr(torch, "compiler") and hasattr(torch.compiler, "disable"):
    torch_compiler_disable = torch.compiler.disable
else:
    def torch_compiler_disable(*args, **kwargs):
        def noop(x):
            return x
        return noop
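
# A minimal check of the fallback above (hedged; the names here are only for
# illustration): on builds without torch.compiler, `@torch_compiler_disable()`
# becomes a no-op decorator that returns the wrapped function unchanged.
#
#   @torch_compiler_disable()
#   def f(x):
#       return x + 1
#
#   assert f(1) == 2  # same behavior, minus the compiler exclusion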

class GGMLTensor(torch.Tensor):
    """
    Main tensor-like class for storing quantized weights
    """
    def __init__(self, *args, tensor_type, tensor_shape, patches=[], **kwargs):
        super().__init__()
        self.tensor_type = tensor_type
        self.tensor_shape = tensor_shape
        self.patches = patches

    def __new__(cls, *args, tensor_type, tensor_shape, patches=[], **kwargs):
        return super().__new__(cls, *args, **kwargs)

    def to(self, *args, **kwargs):
        new = super().to(*args, **kwargs)
        new.tensor_type = getattr(self, "tensor_type", None)
        new.tensor_shape = getattr(self, "tensor_shape", new.data.shape)
        new.patches = getattr(self, "patches", []).copy()
        return new

    def clone(self, *args, **kwargs):
        return self

    def detach(self, *args, **kwargs):
        return self

    def copy_(self, *args, **kwargs):
        # fixes .weight.copy_ in comfy/clip_model/CLIPTextModel
        try:
            return super().copy_(*args, **kwargs)
        except Exception as e:
            print(f"ignoring 'copy_' on tensor: {e}")

    def new_empty(self, size, *args, **kwargs):
        # Intel Arc fix, ref#50
        new_tensor = super().new_empty(size, *args, **kwargs)
        return GGMLTensor(
            new_tensor,
            tensor_type = getattr(self, "tensor_type", None),
            tensor_shape = size,
            patches = getattr(self, "patches", []).copy()
        )

    @property
    def shape(self):
        if not hasattr(self, "tensor_shape"):
            self.tensor_shape = self.size()
        return self.tensor_shape
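
# Usage sketch (hedged): the GGUF loader in this repo wraps each packed quantized
# buffer in a GGMLTensor so the quantization type and the logical (dequantized)
# shape travel with the raw data. Roughly, assuming `reader` is a gguf.GGUFReader
# over some .gguf file and `t` is one of its tensors:
#
#   t = reader.tensors[0]
#   raw = torch.from_numpy(t.data)                               # packed quantized bytes
#   shape = torch.Size([int(d) for d in reversed(t.shape)])      # GGUF stores dims reversed
#   w = GGMLTensor(raw, tensor_type=t.tensor_type, tensor_shape=shape)
#   w.shape  # reports the logical shape via the property above, not the byte layout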

class GGMLLayer(torch.nn.Module):
    """
    This (should) be responsible for de-quantizing on the fly
    """
    comfy_cast_weights = True
    dequant_dtype = None
    patch_dtype = None
    largest_layer = False
    torch_compatible_tensor_types = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16}

    def is_ggml_quantized(self, *, weight=None, bias=None):
        if weight is None:
            weight = self.weight
        if bias is None:
            bias = self.bias
        return is_quantized(weight) or is_quantized(bias)

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        weight, bias = state_dict.get(f"{prefix}weight"), state_dict.get(f"{prefix}bias")
        # NOTE: using modified load for linear due to not initializing on creation, see GGMLOps todo
        if self.is_ggml_quantized(weight=weight, bias=bias) or isinstance(self, torch.nn.Linear):
            return self.ggml_load_from_state_dict(state_dict, prefix, *args, **kwargs)
        return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    def ggml_load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
        prefix_len = len(prefix)
        for k, v in state_dict.items():
            if k[prefix_len:] == "weight":
                self.weight = torch.nn.Parameter(v, requires_grad=False)
            elif k[prefix_len:] == "bias" and v is not None:
                self.bias = torch.nn.Parameter(v, requires_grad=False)
            else:
                unexpected_keys.append(k)

        # For Linear layer with missing weight
        if self.weight is None and isinstance(self, torch.nn.Linear):
            v = torch.zeros(self.in_features, self.out_features)
            self.weight = torch.nn.Parameter(v, requires_grad=False)
            missing_keys.append(prefix + "weight")

        # for vram estimation (TODO: less fragile logic?)
        if getattr(self.weight, "is_largest_weight", False):
            self.largest_layer = True

    def _save_to_state_dict(self, *args, **kwargs):
        if self.is_ggml_quantized():
            return self.ggml_save_to_state_dict(*args, **kwargs)
        return super()._save_to_state_dict(*args, **kwargs)

    def ggml_save_to_state_dict(self, destination, prefix, keep_vars):
        # This is a fake state dict for vram estimation
        weight = torch.zeros_like(self.weight, device=torch.device("meta"))
        destination[prefix + "weight"] = weight
        if self.bias is not None:
            bias = torch.zeros_like(self.bias, device=torch.device("meta"))
            destination[prefix + "bias"] = bias

        # Take into account space required for dequantizing the largest tensor
        if self.largest_layer:
            shape = getattr(self.weight, "tensor_shape", self.weight.shape)
            dtype = self.dequant_dtype or torch.float16
            temp = torch.empty(*shape, device=torch.device("meta"), dtype=dtype)
            destination[prefix + "temp.weight"] = temp

        return
        # This would return the dequantized state dict
        destination[prefix + "weight"] = self.get_weight(self.weight)
        if bias is not None:
            destination[prefix + "bias"] = self.get_weight(self.bias)

    def get_weight(self, tensor, dtype):
        if tensor is None:
            return

        # consolidate and load patches to GPU in async
        patch_list = []
        device = tensor.device
        for function, patches, key in getattr(tensor, "patches", []):
            patch_list += move_patch_to_device(patches, device)

        # dequantize tensor while patches load
        weight = dequantize_tensor(tensor, dtype, self.dequant_dtype)

        # prevent propagating custom tensor class
        if isinstance(weight, GGMLTensor):
            weight.__class__ = torch.Tensor

        # apply patches
        if patch_list:
            if self.patch_dtype is None:
                weight = function(patch_list, weight, key)
            else:
                # for testing, may degrade image quality
                patch_dtype = dtype if self.patch_dtype == "target" else self.patch_dtype
                weight = function(patch_list, weight, key, patch_dtype)
        return weight

    @torch_compiler_disable()
    def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
        if input is not None:
            if dtype is None:
                dtype = getattr(input, "dtype", torch.float32)
            if bias_dtype is None:
                bias_dtype = dtype
            if device is None:
                device = input.device

        bias = None
        non_blocking = comfy.model_management.device_supports_non_blocking(device)
        if s.bias is not None:
            bias = s.get_weight(s.bias.to(device), dtype)
            bias = comfy.ops.cast_to(bias, bias_dtype, device, non_blocking=non_blocking, copy=False)

        weight = s.get_weight(s.weight.to(device), dtype)
        weight = comfy.ops.cast_to(weight, dtype, device, non_blocking=non_blocking, copy=False)
        return weight, bias

    def forward_comfy_cast_weights(self, input, *args, **kwargs):
        if self.is_ggml_quantized():
            out = self.forward_ggml_cast_weights(input, *args, **kwargs)
        else:
            out = super().forward_comfy_cast_weights(input, *args, **kwargs)

        # non-ggml forward might still propagate custom tensor class
        if isinstance(out, GGMLTensor):
            out.__class__ = torch.Tensor
        return out

    def forward_ggml_cast_weights(self, input):
        raise NotImplementedError
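
# Debugging sketch (hedged): weights stay quantized until cast_bias_weight() runs,
# so a dequantized copy can be inspected by calling it directly on a loaded layer.
# `layer` and the device/dtype choices are assumptions for illustration:
#
#   w, b = layer.cast_bias_weight(dtype=torch.float16, device=torch.device("cuda"))
#   print(type(w), w.shape, w.dtype)  # plain torch.Tensor, logical shape, patches applied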

class GGMLOps(comfy.ops.manual_cast):
    """
    Dequantize weights on the fly before doing the compute
    """
    class Linear(GGMLLayer, comfy.ops.manual_cast.Linear):
        def __init__(self, in_features, out_features, bias=True, device=None, dtype=None):
            torch.nn.Module.__init__(self)
            # TODO: better workaround for reserved memory spike on windows
            # Issue is with `torch.empty` still reserving the full memory for the layer
            # Windows doesn't over-commit memory so without this 24GB+ of pagefile is used
            self.in_features = in_features
            self.out_features = out_features
            self.weight = None
            self.bias = None

        def forward_ggml_cast_weights(self, input):
            weight, bias = self.cast_bias_weight(input)
            return torch.nn.functional.linear(input, weight, bias)

    class Conv2d(GGMLLayer, comfy.ops.manual_cast.Conv2d):
        def forward_ggml_cast_weights(self, input):
            weight, bias = self.cast_bias_weight(input)
            return self._conv_forward(input, weight, bias)

    class Embedding(GGMLLayer, comfy.ops.manual_cast.Embedding):
        def forward_ggml_cast_weights(self, input, out_dtype=None):
            output_dtype = out_dtype
            if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16:
                out_dtype = None
            weight, _bias = self.cast_bias_weight(self, device=input.device, dtype=out_dtype)
            return torch.nn.functional.embedding(
                input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse
            ).to(dtype=output_dtype)

    class LayerNorm(GGMLLayer, comfy.ops.manual_cast.LayerNorm):
        def forward_ggml_cast_weights(self, input):
            if self.weight is None:
                return super().forward_comfy_cast_weights(input)
            weight, bias = self.cast_bias_weight(input)
            return torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps)

    class GroupNorm(GGMLLayer, comfy.ops.manual_cast.GroupNorm):
        def forward_ggml_cast_weights(self, input):
            weight, bias = self.cast_bias_weight(input)
            return torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps)
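
# Usage sketch (hedged): ComfyUI picks these layer classes up through the
# "custom_operations" model option, which is roughly how the loader node in this
# repo wires them in. `gguf_sd_loader` stands in for whatever helper reads the
# .gguf file into a state dict of GGMLTensors; the path is hypothetical:
#
#   import comfy.sd
#   ops = GGMLOps()
#   sd = gguf_sd_loader("/path/to/model.gguf")
#   model = comfy.sd.load_diffusion_model_state_dict(
#       sd, model_options={"custom_operations": ops}
#   )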

def move_patch_to_device(item, device):
    if isinstance(item, torch.Tensor):
        return item.to(device, non_blocking=True)
    elif isinstance(item, tuple):
        return tuple(move_patch_to_device(x, device) for x in item)
    elif isinstance(item, list):
        return [move_patch_to_device(x, device) for x in item]
    else:
        return item
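
# Example (hedged): move_patch_to_device() preserves arbitrary nesting, so patch
# structures made of lists/tuples of tensors can be shipped to the compute device
# in one call while non-tensor entries pass through untouched:
#
#   patch = [(torch.randn(4, 4), 0.75), "metadata"]
#   moved = move_patch_to_device(patch, torch.device("cpu"))
#   # same structure back; only the tensor went through .to(device, non_blocking=True)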