leejet · stduhpf · Feb 11, 2025 · Feb 12, 2025 · Mar 13, 2025 · Jun 5, 2025
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -102,9 +102,15 @@ struct SDParams {
     int upscale_repeats           = 1;
 
     std::vector<int> skip_layers = {7, 8, 9};
-    float slg_scale              = 0.f;
+    float slg_scale              = 0.0f;
     float skip_layer_start       = 0.01f;
     float skip_layer_end         = 0.2f;
+    bool slg_uncond              = false;
+
+    float apg_eta            = 1.0f;
+    float apg_momentum       = 0.0f;
+    float apg_norm_threshold = 0.0f;
+    float apg_norm_smoothing = 0.0f;
 
     bool chroma_use_dit_mask = true;
     bool chroma_use_t5_mask  = false;
@@ -204,13 +210,21 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
     printf("  --img-cfg-scale SCALE              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
     printf("  --guidance SCALE                   distilled guidance scale for models with guidance input (default: 3.5)\n");
+    printf("  --apg-eta VALUE                    parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)\n");
+    printf("  --apg-momentum VALUE               Momentum for guidance adjustments with APG (default: 0, recommended: around -0.5 (negative))\n");
+    printf("  --apg-nt VALUE                     APG norm threshold: Upper bound allowed for the amplitude (L2 norm) of guidance updates (default: 0 = disabled, recommended: 4-15)\n");
+    printf("  --apg-nt-smoothing VALUE           EXPERIMENTAL! Norm threshold smoothing for APG, smoothly decrease the amplitude of the guidance update if it gets too close to the norm threshold (default: 0 = disabled)\n");
+    printf("                                     (replaces saturation with a smooth approximation)\n");
     printf("  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
     printf("                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
-    printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
+    printf("  --slg-uncond                       Use CFG's forward pass for SLG instead of a separate pass, only for DiT models\n");
+    printf("                                     To use this, it's recommended to keep slg-scale to 0, both for performance and quality reasons\n");
+    printf("                                     This should be slightly faster than normal cfg when cfg_scale != 1.\n");
     printf("  --skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])\n");
     printf("  --skip-layer-start START           SLG enabling point: (default: 0.01)\n");
     printf("  --skip-layer-end END               SLG disabling point: (default: 0.2)\n");
     printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
+    printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
     printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)\n");
     printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
@@ -412,7 +426,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--slg-scale", "", &params.slg_scale},
         {"", "--skip-layer-start", "", &params.skip_layer_start},
         {"", "--skip-layer-end", "", &params.skip_layer_end},
-
+        {"", "--apg-eta", "", &params.apg_eta},
+        {"", "--apg-momentum", "", &params.apg_momentum},
+        {"", "--apg-nt", "", &params.apg_norm_threshold},
+        {"", "--apg-nt-smoothing", "", &params.apg_norm_smoothing},
     };
 
     options.bool_options = {
@@ -425,6 +442,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--canny", "", true, &params.canny_preprocess},
         {"-v", "--verbos", "", true, &params.verbose},
         {"", "--color", "", true, &params.color},
+        {"", "--slg-uncond", "", true, &params.slg_uncond},
         {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
         {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
     };
@@ -660,7 +678,20 @@ std::string get_image_params(SDParams params, int64_t seed) {
     }
     parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
     parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
+    if (params.apg_eta != 1) {  // only record APG settings that differ from their defaults
+        parameter_string += "APG eta: " + std::to_string(params.apg_eta) + ", ";
+    }
+    if (params.apg_momentum != 0) {
+        parameter_string += "CFG momentum: " + std::to_string(params.apg_momentum) + ", ";
+    }
+    if (params.apg_norm_threshold != 0) {
+        parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_threshold) + ", ";
+        if (params.apg_norm_smoothing != 0) {
+            parameter_string += "CFG normalization smoothing: " + std::to_string(params.apg_norm_smoothing) + ", ";  // distinct label: this is the smoothing value, not the threshold
+        }
+    }
-    if (params.slg_scale != 0 && params.skip_layers.size() != 0) {
+    if ((params.slg_scale != 0 || params.slg_uncond) && params.skip_layers.size() != 0) {  // SLG is also active when only --slg-uncond is set (slg-scale 0)
+        parameter_string += "Unconditional SLG: " + std::string(params.slg_uncond ? "True" : "False") + ", ";
-        parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", ";
+        parameter_string += "SLG scale: " + std::to_string(params.slg_scale) + ", ";  // report the SLG scale itself, not the CFG scale
         parameter_string += "Skip layers: [";
         for (const auto& layer : params.skip_layers) {
@@ -733,17 +764,26 @@ int main(int argc, const char* argv[]) {
 
     parse_args(argc, argv, params);
 
-    sd_guidance_params_t guidance_params = {params.cfg_scale,
-                                            params.img_cfg_scale,
-                                            params.min_cfg,
-                                            params.guidance,
-                                            {
-                                                params.skip_layers.data(),
-                                                params.skip_layers.size(),
-                                                params.skip_layer_start,
-                                                params.skip_layer_end,
-                                                params.slg_scale,
-                                            }};
+    sd_guidance_params_t guidance_params = {
+        params.cfg_scale,
+        params.img_cfg_scale,
+        params.min_cfg,
+        params.guidance,
+        {
+            params.skip_layers.data(),
+            params.skip_layers.size(),
+            params.skip_layer_start,
+            params.skip_layer_end,
+            params.slg_scale,
+            params.slg_uncond,  // sd_slg_params_t::uncond; without this the --slg-uncond flag is parsed but never reaches the sampler
+        },
+        {
+            params.apg_eta,
+            params.apg_momentum,
+            params.apg_norm_threshold,
+            params.apg_norm_smoothing,
+        },
+    };
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
 

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -885,7 +885,7 @@ class StableDiffusionGGML {
 
         bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL;
         bool has_img_cond      = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL;
-        bool has_skiplayer     = slg_scale != 0.0 && skip_layers.size() > 0;
+        bool has_skiplayer     = (slg_scale != 0.0 || guidance.slg.uncond) && skip_layers.size() > 0;
 
         // denoise wrapper
         struct ggml_tensor* out_cond     = ggml_dup_tensor(work_ctx, x);
@@ -898,7 +898,9 @@ class StableDiffusionGGML {
         }
         if (has_skiplayer) {
             if (sd_version_is_dit(version)) {
-                out_skip = ggml_dup_tensor(work_ctx, x);
+                if (slg_scale != 0.0) {
+                    out_skip = ggml_dup_tensor(work_ctx, x);
+                }
             } else {
                 has_skiplayer = false;
                 LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
@@ -909,6 +911,10 @@ class StableDiffusionGGML {
         }
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
+        std::vector<float> apg_momentum_buffer;
+        if (guidance.apg.momentum != 0)
+            apg_momentum_buffer.resize((size_t)ggml_nelements(denoised));
+
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
             if (step == 1) {
                 pretty_progress(0, (int)steps, 0);
@@ -968,6 +974,8 @@ class StableDiffusionGGML {
                                          control_strength,
                                          &out_cond);
             }
+            int step_count         = sigmas.size();
+            bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count);
 
             float* negative_data = NULL;
             if (has_unconditioned) {
@@ -976,18 +984,36 @@ class StableDiffusionGGML {
                     control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector);
                     controls = control_net->controls;
                 }
-                diffusion_model->compute(n_threads,
-                                         noised_input,
-                                         timesteps,
-                                         uncond.c_crossattn,
-                                         uncond.c_concat,
-                                         uncond.c_vector,
-                                         guidance_tensor,
-                                         ref_latents,
-                                         -1,
-                                         controls,
-                                         control_strength,
-                                         &out_uncond);
+                if (is_skiplayer_step && guidance.slg.uncond) {
+                    LOG_DEBUG("Skipping layers at uncond step %d\n", step);
+                    diffusion_model->compute(n_threads,
+                                             noised_input,
+                                             timesteps,
+                                             uncond.c_crossattn,
+                                             uncond.c_concat,
+                                             uncond.c_vector,
+                                             guidance_tensor,
+                                             ref_latents,
+                                             -1,
+                                             controls,
+                                             control_strength,
+                                             &out_uncond,
+                                             NULL,
+                                             skip_layers);
+                } else {
+                    diffusion_model->compute(n_threads,
+                                             noised_input,
+                                             timesteps,
+                                             uncond.c_crossattn,
+                                             uncond.c_concat,
+                                             uncond.c_vector,
+                                             guidance_tensor,
+                                             ref_latents,
+                                             -1,
+                                             controls,
+                                             control_strength,
+                                             &out_uncond);
+                }
                 negative_data = (float*)out_uncond->data;
             }
 
@@ -1008,10 +1034,8 @@ class StableDiffusionGGML {
                 img_cond_data = (float*)out_img_cond->data;
             }
 
-            int step_count         = sigmas.size();
-            bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count);
             float* skip_layer_data = NULL;
-            if (is_skiplayer_step) {
+            if (is_skiplayer_step && slg_scale != 0.0) {
                 LOG_DEBUG("Skipping layers at step %d\n", step);
                 // skip layer (same as conditionned)
                 diffusion_model->compute(n_threads,
@@ -1034,6 +1058,87 @@ class StableDiffusionGGML {
             float* vec_input     = (float*)input->data;
             float* positive_data = (float*)out_cond->data;
             int ne_elements      = (int)ggml_nelements(denoised);
+
+            float* deltas = vec_denoised;
+
+            // APG: https://arxiv.org/pdf/2410.02416
+
+            bool log_cfg_norm                 = false;  // when true, the L2 norm of the guidance delta is logged below
+            const char* SD_LOG_CFG_DELTA_NORM = getenv("SD_LOG_CFG_DELTA_NORM");  // NOTE(review): looked up each time this code runs; consider reading it once outside the sampling loop
+            if (SD_LOG_CFG_DELTA_NORM != nullptr) {
+                std::string sd_log_cfg_norm_str = SD_LOG_CFG_DELTA_NORM;
+                if (sd_log_cfg_norm_str == "ON" || sd_log_cfg_norm_str == "TRUE") {
+                    log_cfg_norm = true;
+                } else if (sd_log_cfg_norm_str != "OFF" && sd_log_cfg_norm_str != "FALSE") {
+                    LOG_WARN("SD_LOG_CFG_DELTA_NORM environment variable has unexpected value. Assuming default (\"OFF\"). (Expected \"ON\"/\"TRUE\" or \"OFF\"/\"FALSE\", got \"%s\")", SD_LOG_CFG_DELTA_NORM);
+                }
+            }
+            float apg_scale_factor = 1.;
+            float diff_norm        = 0;
+            float cond_norm_sq     = 0;
+            float dot              = 0;
+            if (has_unconditioned || has_img_cond) {
+                for (int i = 0; i < ne_elements; i++) {
+                    float delta;
+                    if (has_img_cond) {
+                        if (cfg_scale == 1) {
+                            // Weird guidance (important: use img_cfg_scale instead of cfg_scale in the final formula)
+                            delta = img_cond_data[i] - negative_data[i];
+                        } else if (has_unconditioned) {
+                            // 2-conditioning CFG (img_cfg_scale != cfg_scale != 1)
+                            delta = positive_data[i] + (negative_data[i] * (1 - img_cfg_scale) + img_cond_data[i] * (img_cfg_scale - cfg_scale)) / (cfg_scale - 1);
+                        } else {
+                            // pure img CFG (img_cfg_scale == 1, cfg_scale !=1)
+                            delta = positive_data[i] - img_cond_data[i];
+                        }
+                    } else {
+                        // classic CFG (img_cfg_scale == cfg_scale != 1)
+                        delta = positive_data[i] - negative_data[i];
+                    }
+                    if (guidance.apg.momentum != 0) {
+                        delta += guidance.apg.momentum * apg_momentum_buffer[i];
+                        apg_momentum_buffer[i] = delta;
+                    }
+                    if (guidance.apg.norm_treshold > 0 || log_cfg_norm) {
+                        diff_norm += delta * delta;
+                    }
+                    if (guidance.apg.eta != 1.0f) {
+                        cond_norm_sq += positive_data[i] * positive_data[i];
+                        dot += positive_data[i] * delta;
+                    }
+                    deltas[i] = delta;
+                }
+                if (log_cfg_norm) {
+                    LOG_INFO("CFG Delta norm: %.2f", sqrtf(diff_norm));
+                }
+                if (guidance.apg.norm_treshold > 0 && diff_norm > 0) {  // diff_norm is still the squared sum here; an all-zero delta needs no clamping and would give inf/inf = NaN in the smoothed branch
+                    diff_norm = sqrtf(diff_norm);
+                    if (guidance.apg.norm_treshold_smoothing <= 0) {
+                        apg_scale_factor = std::min(1.0f, guidance.apg.norm_treshold / diff_norm);  // hard clamp of the delta norm to the threshold
+                    } else {
+                        // Experimental: smooth saturate
+                        float x          = guidance.apg.norm_treshold / diff_norm;
+                        apg_scale_factor = x / std::pow(1 + std::pow(x, 1.0 / guidance.apg.norm_treshold_smoothing), guidance.apg.norm_treshold_smoothing);
+                    }
+                }
+                if (guidance.apg.eta != 1.0f) {
+                    dot *= apg_scale_factor;
+                    // pre-normalize (avoids one square root and ne_elements extra divs)
+                    dot = cond_norm_sq > 0 ? dot / cond_norm_sq : 0.0f;  // an all-zero conditional output has no parallel component; avoid 0/0 = NaN
+                }
+
+                for (int i = 0; i < ne_elements; i++) {
+                    deltas[i] *= apg_scale_factor;
+                    if (guidance.apg.eta != 1.0f) {
+                        float apg_parallel   = dot * positive_data[i];
+                        float apg_orthogonal = deltas[i] - apg_parallel;
+
+                        // tweak deltas
+                        deltas[i] = apg_orthogonal + guidance.apg.eta * apg_parallel;
+                    }
+                }
+            }
+
             for (int i = 0; i < ne_elements; i++) {
                 float latent_result = positive_data[i];
                 if (has_unconditioned) {
@@ -1043,19 +1148,19 @@ class StableDiffusionGGML {
                         int64_t i3  = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
                         float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
                     } else {
-                        if (has_img_cond) {
-                            // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
-                            latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
-                        } else {
-                            // img_cfg_scale == cfg_scale
-                            latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
+                        float delta = deltas[i];
+
+                        if (cfg_scale != 1) {
+                            latent_result = positive_data[i] + (cfg_scale - 1) * delta;
+                        } else if (has_img_cond) {
+                            latent_result = positive_data[i] + (img_cfg_scale - 1) * delta;
                         }
                     }
                 } else if (has_img_cond) {
                     // img_cfg_scale == 1
                     latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]);
                 }
-                if (is_skiplayer_step) {
+                if (is_skiplayer_step && slg_scale != 0.0) {
                     latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
                 }
                 // v = latent_result, eps = latent_result
@@ -1096,7 +1201,8 @@ class StableDiffusionGGML {
     }
 
     // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
-    ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
+    ggml_tensor*
+    get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
         // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
         ggml_tensor* latent       = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
         struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);

diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -152,14 +152,23 @@ typedef struct {
     float layer_start;
     float layer_end;
     float scale;
+    bool uncond;
 } sd_slg_params_t;
 
+typedef struct {  // APG (adaptive projected guidance, arXiv 2410.02416) settings applied to the CFG delta
+    float eta;                      // weight of the delta component parallel to the conditional output; 1.0 leaves the delta unchanged
+    float momentum;                 // factor applied to the previous step's delta before adding it to the current one; 0 disables
+    float norm_treshold;            // upper bound on the L2 norm of the delta; values <= 0 disable clamping. NOTE(review): spelled "treshold" (sic)
+    float norm_treshold_smoothing;  // <= 0 selects a hard clamp, > 0 a smooth saturation (experimental); only used when norm_treshold > 0
+} sd_apg_params_t;
+
 typedef struct {
     float txt_cfg;
     float img_cfg;
     float min_cfg;
     float distilled_guidance;
     sd_slg_params_t slg;
+    sd_apg_params_t apg;
 } sd_guidance_params_t;
 
 typedef struct {