File size: 14,709 Bytes
3483284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index de3c706f..0267c1fa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -223,7 +223,7 @@
 #define GGML_MAX_OP_PARAMS      64
 
 #ifndef GGML_MAX_NAME
-#   define GGML_MAX_NAME        64
+#   define GGML_MAX_NAME        128
 #endif
 
 #define GGML_DEFAULT_N_THREADS  4
@@ -2449,6 +2449,7 @@ extern "C" {
 
     // manage tensor info
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, int n_dim);
     GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
     GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b16c462f..6d1568f1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -22960,6 +22960,20 @@ void gguf_add_tensor(
     ctx->header.n_tensors++;
 }
 
+// Override the number of dimensions recorded for a tensor that was already added to `ctx`,
+// e.g. to keep a size-1 dimension that would otherwise be dropped from the stored shape.
+// Aborts if `name` is not present in `ctx` or if `n_dim` is outside [1, GGML_MAX_DIMS].
+void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, const int n_dim) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ABORT("tensor not found");
+    }
+    if (n_dim < 1 || n_dim > GGML_MAX_DIMS) {
+        GGML_ABORT("invalid number of dimensions: %d", n_dim);
+    }
+    ctx->infos[idx].n_dims = n_dim;
+}
+
 void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
     const int idx = gguf_find_tensor(ctx, name);
     if (idx < 0) {
diff --git a/src/llama.cpp b/src/llama.cpp
index 24e1f1f0..aeccc173 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -205,6 +205,11 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_FLUX,
+    LLM_ARCH_SD1,
+    LLM_ARCH_SDXL,
+    LLM_ARCH_SD3,
+    LLM_ARCH_AURA,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -258,6 +263,11 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
     { LLM_ARCH_CHAMELEON,       "chameleon"    },
+    { LLM_ARCH_FLUX,            "flux"         },
+    { LLM_ARCH_SD1,             "sd1"          },
+    { LLM_ARCH_SDXL,            "sdxl"         },
+    { LLM_ARCH_SD3,             "sd3"          },
+    { LLM_ARCH_AURA,            "aura"         },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
@@ -1531,6 +1541,11 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
+    { LLM_ARCH_FLUX, {}},
+    { LLM_ARCH_SD1,  {}},
+    { LLM_ARCH_SDXL, {}},
+    { LLM_ARCH_SD3,  {}},
+    { LLM_ARCH_AURA, {}},
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -5403,6 +5418,12 @@ static void llm_load_hparams(
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
+    // Disable LLM metadata for image models
+    if (model.arch == LLM_ARCH_FLUX || model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL || model.arch == LLM_ARCH_SD3 || model.arch == LLM_ARCH_AURA) {
+        model.ftype = ml.ftype;
+        return;
+    }
+
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
 
@@ -18016,6 +18037,125 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
+// Picks the quantization type for one image-model tensor, starting from `new_type` and adjusting it by tensor name and `ftype`.
+// Updates the counters in `qs`; throws std::runtime_error for rejected tensor names and for unsupported `ftype` values.
+static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+    const std::string name = ggml_get_name(tensor);
+
+    // Sanity check: reject files whose tensor names still contain any of these prefixes
+    if (
+            (name.find("model.diffusion_model.") != std::string::npos) ||
+            (name.find("first_stage_model.") != std::string::npos) ||
+            (name.find("single_transformer_blocks.") != std::string::npos) ||
+            (name.find("joint_transformer_blocks.") != std::string::npos)
+        ) {
+            throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
+    }
+
+    // Unsupported quant types - exclude all IQ quants (and the Q4_0_x_x variants) for now
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ2_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ1_M   || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M   || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
+        ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
+        throw std::runtime_error("Invalid quantization type for image model (Not supported)");
+    }
+
+    if ( // Rules for to_v attention
+            (name.find("attn_v.weight") != std::string::npos) ||
+            (name.find(".to_v.weight") != std::string::npos) ||
+            (name.find(".attn.w1v.weight") != std::string::npos) ||
+            (name.find(".attn.w2v.weight") != std::string::npos)
+        ){
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+                new_type = GGML_TYPE_Q3_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+                new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            ++qs.i_attention_wv;
+    } else if ( // Rules for fused qkv attention
+            (name.find("attn_qkv.weight") != std::string::npos) ||
+            (name.find("attn.qkv.weight") != std::string::npos)
+        ) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+    } else if ( // Rules for ffn
+            (name.find("ffn_down") != std::string::npos)
+        ) {
+            // TODO: add back `layer_info` with some model specific logic + logic further down
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
+                new_type = GGML_TYPE_Q4_1;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+                new_type = GGML_TYPE_Q5_1;
+            }
+            ++qs.i_ffn_down;
+    }
+
+    // Sanity check for row shape: the K-quants below need ne[0] to be a multiple of QK_K
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K    || new_type == GGML_TYPE_Q3_K    || new_type == GGML_TYPE_Q4_K   ||
+        new_type == GGML_TYPE_Q5_K    || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        // TODO: Possibly reenable this in the future
+        // switch (new_type) {
+        //     case GGML_TYPE_Q2_K:
+        //     case GGML_TYPE_Q3_K:
+        //     case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
+        //     case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
+        //     case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
+        //     default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        // }
+        new_type = GGML_TYPE_F16;
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
@@ -18547,6 +18687,29 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ctx_outs[i_split] = gguf_init_empty();
         }
         gguf_add_tensor(ctx_outs[i_split], tensor);
+        // SD3 pos_embed needs special fix as first dim is 1, which gets truncated here
+        if (model.arch == LLM_ARCH_SD3) {
+            const std::string name = ggml_get_name(tensor);
+            if (name == "pos_embed" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "pos_embed", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting pos_embed shape for SD3: [key:%s]\n", __func__, tensor->name);
+            }
+        }
+        // same goes for auraflow
+        if (model.arch == LLM_ARCH_AURA) {
+            const std::string name = ggml_get_name(tensor);
+            if (name == "positional_encoding" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "positional_encoding", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting positional_encoding shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
+            }
+            if (name == "register_tokens" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "register_tokens", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting register_tokens shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
+            }
+        }
     }
 
     // Set split info if needed
@@ -18647,6 +18810,56 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // rules for image models
+        bool image_model = false;
+        if (model.arch == LLM_ARCH_FLUX) {
+            image_model = true;
+            quantize &= name.find("txt_in.") == std::string::npos;
+            quantize &= name.find("img_in.") == std::string::npos;
+            quantize &= name.find("time_in.") == std::string::npos;
+            quantize &= name.find("vector_in.") == std::string::npos;
+            quantize &= name.find("guidance_in.") == std::string::npos;
+            quantize &= name.find("final_layer.") == std::string::npos;
+        }
+        if (model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
+            image_model = true;
+            quantize &= name.find("class_embedding.") == std::string::npos;
+            quantize &= name.find("time_embedding.") == std::string::npos;
+            quantize &= name.find("add_embedding.") == std::string::npos;
+            quantize &= name.find("time_embed.") == std::string::npos;
+            quantize &= name.find("label_emb.") == std::string::npos;
+            quantize &= name.find("conv_in.") == std::string::npos;
+            quantize &= name.find("conv_out.") == std::string::npos;
+            quantize &= name != "input_blocks.0.0.weight";
+            quantize &= name != "out.2.weight";
+        }
+        if (model.arch == LLM_ARCH_SD3) {
+            image_model = true;
+            quantize &= name.find("final_layer.") == std::string::npos;
+            quantize &= name.find("time_text_embed.") == std::string::npos;
+            quantize &= name.find("context_embedder.") == std::string::npos;
+            quantize &= name.find("t_embedder.") == std::string::npos;
+            quantize &= name.find("y_embedder.") == std::string::npos;
+            quantize &= name.find("x_embedder.") == std::string::npos;
+            quantize &= name != "proj_out.weight";
+            quantize &= name != "pos_embed";
+        }
+        if (model.arch == LLM_ARCH_AURA) {
+            image_model = true;
+            quantize &= name.find("t_embedder.") == std::string::npos;
+            quantize &= name.find("init_x_linear.") == std::string::npos;
+            quantize &= name != "modF.1.weight";
+            quantize &= name != "cond_seq_linear.weight";
+            quantize &= name != "final_linear.weight";
+            quantize &= name != "final_linear.weight";
+            quantize &= name != "positional_encoding";
+            quantize &= name != "register_tokens";
+        }
+        // ignore 3D/4D tensors for image models as the code was never meant to handle these
+        if (image_model) {
+            quantize &= ggml_n_dims(tensor) == 2;
+        }
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -18655,6 +18868,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = default_type;
 
             // get more optimal quantization type based on the tensor shape, layer, etc.
+            if (image_model) {
+                new_type = img_tensor_get_type(qs, new_type, tensor, ftype);
+            } else {
             if (!params->pure && ggml_is_quantized(default_type)) {
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }
@@ -18664,6 +18880,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                 new_type = params->output_tensor_type;
             }
+            }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.