leejet · leejet · May 16, 2026 · May 16, 2026
diff --git a/examples/cli/README.md b/examples/cli/README.md
@@ -55,7 +55,7 @@ Context Options:
                                            then threads will be set to the number of CPU physical cores
   --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
   --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-                                           graph splitting
+                                           graph splitting; -1 auto-detects free VRAM minus 1 GiB
   --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
   --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                            when needed

diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -397,7 +397,7 @@ ArgOptions SDContextParams::get_options() {
     options.float_options = {
         {"",
          "--max-vram",
-         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting",
+         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
          &max_vram},
     };
 

diff --git a/examples/server/README.md b/examples/server/README.md
@@ -157,7 +157,7 @@ Context Options:
                                            then threads will be set to the number of CPU physical cores
   --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
   --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-                                           graph splitting
+                                           graph splitting; -1 auto-detects free VRAM minus 1 GiB
   --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
   --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                            when needed

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -205,7 +205,7 @@ typedef struct {
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
-    float max_vram;
+    float max_vram;  // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
 } sd_ctx_params_t;
 
 typedef struct {

diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp
@@ -16,6 +16,9 @@
 
 namespace sd::ggml_graph_cut {
 
+    static constexpr double MAX_VRAM_BYTES_PER_GIB      = 1024.0 * 1024.0 * 1024.0;  // bytes per GiB, kept as double for GiB<->byte conversions
+    static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;  // 1 GiB held back from free VRAM in auto (-1) mode
+
     static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
         if (tensor == nullptr) {
             return "<null>";
@@ -79,6 +82,58 @@ namespace sd::ggml_graph_cut {
                segment.output_bytes;
     }
 
+    size_t max_vram_gib_to_bytes(float max_vram) {  // GiB budget -> bytes; 0 means graph splitting is disabled
+        constexpr size_t max_bytes = static_cast<size_t>(-1);  // largest value size_t can hold
+        const double bytes = static_cast<double>(max_vram) * MAX_VRAM_BYTES_PER_GIB;
+        if (!(bytes > 0.0)) { return 0; }  // non-positive or NaN; casting NaN to size_t would be undefined
+        return bytes >= static_cast<double>(max_bytes) ? max_bytes : static_cast<size_t>(bytes);  // saturate inf/huge
+    }
+
+    static float max_vram_bytes_to_gib(size_t max_vram_bytes) {  // converts a byte count to GiB
+        return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);  // divide in double, then narrow to float
+    }
+
+    static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
+        // Auto mode: budget = free device memory minus the fixed reserve. Any failure warns and returns 0.
+        if (!backend) {
+            LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
+            return 0;
+        }
+        ggml_backend_dev_t device = ggml_backend_get_device(backend);
+        if (!device) {
+            LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
+            return 0;
+        }
+        if (GGML_BACKEND_DEVICE_TYPE_CPU == ggml_backend_dev_type(device)) {
+            LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
+            return 0;
+        }
+
+        size_t free_bytes  = 0;
+        size_t total_bytes = 0;
+        ggml_backend_dev_memory(device, &free_bytes, &total_bytes);
+        const double free_gib = free_bytes / MAX_VRAM_BYTES_PER_GIB;
+        // Guard first so the unsigned subtraction below cannot wrap around.
+        if (!(free_bytes > MAX_VRAM_AUTO_RESERVE_BYTES)) {
+            LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget", free_gib);
+            return 0;
+        }
+
+        const size_t budget_bytes = free_bytes - MAX_VRAM_AUTO_RESERVE_BYTES;
+        LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
+                 free_gib,
+                 total_bytes / MAX_VRAM_BYTES_PER_GIB,
+                 budget_bytes / MAX_VRAM_BYTES_PER_GIB);
+        return budget_bytes;
+    }
+
+    float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
+        // Translates the user-supplied --max-vram value into a concrete GiB budget.
+        // Only the exact sentinel -1 asks for auto-detection; any other value is handed back untouched.
+        const bool wants_auto = (max_vram == -1.f);
+        return wants_auto ? max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend)) : max_vram;
+    }
+
     static Segment make_segment_seed(const Plan& plan,
                                      size_t start_segment_index,
                                      size_t end_segment_index) {

diff --git a/src/ggml_graph_cut.h b/src/ggml_graph_cut.h
@@ -83,6 +83,8 @@ namespace sd::ggml_graph_cut {
                                           ggml_cgraph* gf,
                                           const Segment& segment,
                                           const char* log_desc);
+    size_t max_vram_gib_to_bytes(float max_vram);  // GiB -> bytes; returns 0 when max_vram <= 0
+    float resolve_max_vram_gib(float max_vram, ggml_backend_t backend);  // returns max_vram unless it is exactly -1, then the auto-detected GiB budget (0 if unavailable)
     Plan build_plan(ggml_backend_t backend,
                     ggml_cgraph* gf,
                     const std::unordered_set<const ggml_tensor*>& params_tensor_set,

diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
@@ -1,4 +1,5 @@
 #include "ggml_extend.hpp"
+#include "ggml_graph_cut.h"
 
 #include "model.h"
 #include "rng.hpp"
@@ -209,6 +210,7 @@ class StableDiffusionGGML {
         ggml_log_set(ggml_log_callback_default, nullptr);
 
         init_backend();
+        max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend);
 
         ModelLoader model_loader;
 
@@ -426,9 +428,7 @@ class StableDiffusionGGML {
 
         bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
 
-        const size_t max_graph_vram_bytes = max_vram <= 0.f
-                                                ? 0
-                                                : static_cast<size_t>(static_cast<double>(max_vram) * 1024.0 * 1024.0 * 1024.0);
+        const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(max_vram);
 
         {
             clip_backend = backend;
@@ -3597,9 +3597,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
             hires_upscaler                    = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
                                                             false,
                                                             request.hires.upscale_tile_size);
-            const size_t max_graph_vram_bytes = sd_ctx->sd->max_vram <= 0.f
-                                                    ? 0
-                                                    : static_cast<size_t>(static_cast<double>(sd_ctx->sd->max_vram) * 1024.0 * 1024.0 * 1024.0);
+            const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
             hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
             if (!hires_upscaler->load_from_file(request.hires.model_path,
                                                 sd_ctx->sd->offload_params_to_cpu,