Spaces:
Paused
Paused
File size: 14,709 Bytes
3483284 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 |
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index de3c706f..0267c1fa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -223,7 +223,7 @@
#define GGML_MAX_OP_PARAMS 64
#ifndef GGML_MAX_NAME
-# define GGML_MAX_NAME 64
+# define GGML_MAX_NAME 128
#endif
#define GGML_DEFAULT_N_THREADS 4
@@ -2449,6 +2449,7 @@ extern "C" {
// manage tensor info
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+ GGML_API void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, int n_dim);
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b16c462f..6d1568f1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -22960,6 +22960,23 @@ void gguf_add_tensor(
ctx->header.n_tensors++;
}
+// Override the number of dimensions recorded for a tensor that is already in the context.
+//
+// ctx   - gguf context whose tensor info is updated
+// name  - name of the tensor to update; aborts with "tensor not found" if no such tensor exists
+// n_dim - new dimension count; must be in [1, GGML_MAX_DIMS]
+void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, const int n_dim) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ABORT("tensor not found");
+    }
+    // reject dimension counts outside the range a ggml tensor can have before storing the value
+    if (n_dim < 1 || n_dim > GGML_MAX_DIMS) {
+        GGML_ABORT("invalid number of dimensions");
+    }
+    ctx->infos[idx].n_dims = n_dim;
+}
+
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
diff --git a/src/llama.cpp b/src/llama.cpp
index 24e1f1f0..aeccc173 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -205,6 +205,11 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
+ LLM_ARCH_FLUX,
+ LLM_ARCH_SD1,
+ LLM_ARCH_SDXL,
+ LLM_ARCH_SD3,
+ LLM_ARCH_AURA,
LLM_ARCH_UNKNOWN,
};
@@ -258,6 +263,11 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_FLUX, "flux" },
+ { LLM_ARCH_SD1, "sd1" },
+ { LLM_ARCH_SDXL, "sdxl" },
+ { LLM_ARCH_SD3, "sd3" },
+ { LLM_ARCH_AURA, "aura" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1531,6 +1541,11 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+ { LLM_ARCH_FLUX, {}},
+ { LLM_ARCH_SD1, {}},
+ { LLM_ARCH_SDXL, {}},
+ { LLM_ARCH_SD3, {}},
+ { LLM_ARCH_AURA, {}},
{
LLM_ARCH_UNKNOWN,
{
@@ -5403,6 +5418,12 @@ static void llm_load_hparams(
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
+ // Disable LLM metadata for image models
+ if (model.arch == LLM_ARCH_FLUX || model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL || model.arch == LLM_ARCH_SD3 || model.arch == LLM_ARCH_AURA) {
+ model.ftype = ml.ftype;
+ return;
+ }
+
// get hparams kv
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
@@ -18016,6 +18037,125 @@ static void llama_tensor_dequantize_internal(
workers.clear();
}
+// Pick the ggml type used to store one tensor of an image model.
+//
+// qs       - quantization state; its i_attention_wv, i_ffn_down, n_k_quantized and n_fallback counters are updated
+// new_type - type selected so far; returned as-is when no rule below matches the tensor name
+// tensor   - tensor being converted; its name selects the rule and ne[0] must be a multiple of QK_K for K-quants
+// ftype    - requested file type; decides which type a matching tensor is moved to
+//
+// Throws std::runtime_error if the name contains one of the rejected prefixes or ftype is an unsupported type.
+// A K-quant choice whose row length is not divisible by QK_K is replaced by GGML_TYPE_F16.
+static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+    const std::string name = ggml_get_name(tensor);
+
+    // Sanity check: a name containing any of these substrings marks the input as unsupported
+    if (
+        (name.find("model.diffusion_model.") != std::string::npos) ||
+        (name.find("first_stage_model.") != std::string::npos) ||
+        (name.find("single_transformer_blocks.") != std::string::npos) ||
+        (name.find("joint_transformer_blocks.") != std::string::npos)
+    ) {
+        throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
+    }
+
+    // Unsupported file types - all IQ quants plus the Q4_0_4_4 / Q4_0_4_8 / Q4_0_8_8 variants are rejected for now
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
+        ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
+        throw std::runtime_error("Invalid quantization type for image model (Not supported)");
+    }
+
+    if ( // Rules for to_v attention
+        (name.find("attn_v.weight") != std::string::npos) ||
+        (name.find(".to_v.weight") != std::string::npos) ||
+        (name.find(".attn.w1v.weight") != std::string::npos) ||
+        (name.find(".attn.w2v.weight") != std::string::npos)
+    ) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = GGML_TYPE_Q3_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q5_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        // running count of tensors matched by the to_v rules; the Q3_K_M and Q4_K_S cases above read it
+        ++qs.i_attention_wv;
+    } else if ( // Rules for fused qkv attention
+        (name.find("attn_qkv.weight") != std::string::npos) ||
+        (name.find("attn.qkv.weight") != std::string::npos)
+    ) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q4_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            new_type = GGML_TYPE_Q5_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if ( // Rules for ffn
+        (name.find("ffn_down") != std::string::npos)
+    ) {
+        // TODO: add back `layer_info` with some model specific logic + logic further down
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = GGML_TYPE_Q4_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q5_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
+            new_type = GGML_TYPE_Q4_1;
+        } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+            new_type = GGML_TYPE_Q5_1;
+        }
+        // running count of ffn_down tensors; not read by any rule in this function
+        ++qs.i_ffn_down;
+    }
+
+    // Sanity check for row shape
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        // ny is only reported in the warning below
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        } else {
+            // row length fits: count the tensor as K-quantized
+            ++qs.n_k_quantized;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        // TODO: Possibly reenable this in the future
+        // switch (new_type) {
+        // case GGML_TYPE_Q2_K:
+        // case GGML_TYPE_Q3_K:
+        // case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+        // case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+        // case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+        // default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        // }
+        // incompatible rows are stored as F16 rather than mapped to another quant type
+        new_type = GGML_TYPE_F16;
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
+    }
+    return new_type;
+}
+
static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
@@ -18547,6 +18687,29 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
ctx_outs[i_split] = gguf_init_empty();
}
gguf_add_tensor(ctx_outs[i_split], tensor);
+ // SD3 pos_embed needs special fix as first dim is 1, which gets truncated here
+ if (model.arch == LLM_ARCH_SD3) {
+ const std::string name = ggml_get_name(tensor);
+ if (name == "pos_embed" && tensor->ne[2] == 1) {
+ const int n_dim = 3;
+ gguf_set_tensor_ndim(ctx_outs[i_split], "pos_embed", n_dim);
+ LLAMA_LOG_INFO("\n%s: Correcting pos_embed shape for SD3: [key:%s]\n", __func__, tensor->name);
+ }
+ }
+ // same goes for auraflow
+ if (model.arch == LLM_ARCH_AURA) {
+ const std::string name = ggml_get_name(tensor);
+ if (name == "positional_encoding" && tensor->ne[2] == 1) {
+ const int n_dim = 3;
+ gguf_set_tensor_ndim(ctx_outs[i_split], "positional_encoding", n_dim);
+ LLAMA_LOG_INFO("\n%s: Correcting positional_encoding shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
+ }
+ if (name == "register_tokens" && tensor->ne[2] == 1) {
+ const int n_dim = 3;
+ gguf_set_tensor_ndim(ctx_outs[i_split], "register_tokens", n_dim);
+ LLAMA_LOG_INFO("\n%s: Correcting register_tokens shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
+ }
+ }
}
// Set split info if needed
@@ -18647,6 +18810,56 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+ // rules for image models
+ bool image_model = false;
+ if (model.arch == LLM_ARCH_FLUX) {
+ image_model = true;
+ quantize &= name.find("txt_in.") == std::string::npos;
+ quantize &= name.find("img_in.") == std::string::npos;
+ quantize &= name.find("time_in.") == std::string::npos;
+ quantize &= name.find("vector_in.") == std::string::npos;
+ quantize &= name.find("guidance_in.") == std::string::npos;
+ quantize &= name.find("final_layer.") == std::string::npos;
+ }
+ if (model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
+ image_model = true;
+ quantize &= name.find("class_embedding.") == std::string::npos;
+ quantize &= name.find("time_embedding.") == std::string::npos;
+ quantize &= name.find("add_embedding.") == std::string::npos;
+ quantize &= name.find("time_embed.") == std::string::npos;
+ quantize &= name.find("label_emb.") == std::string::npos;
+ quantize &= name.find("conv_in.") == std::string::npos;
+ quantize &= name.find("conv_out.") == std::string::npos;
+ quantize &= name != "input_blocks.0.0.weight";
+ quantize &= name != "out.2.weight";
+ }
+ if (model.arch == LLM_ARCH_SD3) {
+ image_model = true;
+ quantize &= name.find("final_layer.") == std::string::npos;
+ quantize &= name.find("time_text_embed.") == std::string::npos;
+ quantize &= name.find("context_embedder.") == std::string::npos;
+ quantize &= name.find("t_embedder.") == std::string::npos;
+ quantize &= name.find("y_embedder.") == std::string::npos;
+ quantize &= name.find("x_embedder.") == std::string::npos;
+ quantize &= name != "proj_out.weight";
+ quantize &= name != "pos_embed";
+ }
+        if (model.arch == LLM_ARCH_AURA) {
+            // AuraFlow: clear `quantize` for the tensors named below (each name is listed once)
+            image_model = true;
+            quantize &= name.find("t_embedder.") == std::string::npos;
+            quantize &= name.find("init_x_linear.") == std::string::npos;
+            quantize &= name != "modF.1.weight";
+            quantize &= name != "cond_seq_linear.weight";
+            quantize &= name != "final_linear.weight";
+            quantize &= name != "positional_encoding";
+            quantize &= name != "register_tokens";
+        }
+ // ignore 3D/4D tensors for image models as the code was never meant to handle these
+ if (image_model) {
+ quantize &= ggml_n_dims(tensor) == 2;
+ }
+
enum ggml_type new_type;
void * new_data;
size_t new_size;
@@ -18655,6 +18868,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_type = default_type;
// get more optimal quantization type based on the tensor shape, layer, etc.
+ if (image_model) {
+ new_type = img_tensor_get_type(qs, new_type, tensor, ftype);
+ } else {
if (!params->pure && ggml_is_quantized(default_type)) {
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
}
@@ -18664,6 +18880,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
new_type = params->output_tensor_type;
}
+ }
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
|