Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ Context Options:
then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting
graph splitting; -1 auto-detects free VRAM minus 1 GiB
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
Expand Down
2 changes: 1 addition & 1 deletion examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ ArgOptions SDContextParams::get_options() {
options.float_options = {
{"",
"--max-vram",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
&max_vram},
};

Expand Down
2 changes: 1 addition & 1 deletion examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ Context Options:
then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting
graph splitting; -1 auto-detects free VRAM minus 1 GiB
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
Expand Down
2 changes: 1 addition & 1 deletion include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ typedef struct {
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t;
float max_vram;
float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
} sd_ctx_params_t;

typedef struct {
Expand Down
55 changes: 55 additions & 0 deletions src/ggml_graph_cut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@

namespace sd::ggml_graph_cut {

// Number of bytes in one GiB (2^30), kept as a double for GiB <-> byte conversions.
static constexpr double MAX_VRAM_BYTES_PER_GIB = static_cast<double>(1ULL << 30);
// Headroom of one GiB (2^30 bytes) held back from free VRAM when the budget is auto-detected.
static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = static_cast<size_t>(1) << 30;

static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
if (tensor == nullptr) {
return "<null>";
Expand Down Expand Up @@ -79,6 +82,58 @@ namespace sd::ggml_graph_cut {
segment.output_bytes;
}

// Converts a VRAM budget expressed in GiB into a byte count.
//
// Returns 0 for any budget that is not strictly positive (zero, negative, or NaN).
// Budgets too large to be represented in size_t (including +infinity) saturate at
// the largest size_t value, because an out-of-range floating-point to integer
// conversion is undefined behaviour.
size_t max_vram_gib_to_bytes(float max_vram) {
    // Written as !(x > 0) rather than (x <= 0) so that NaN also takes this branch.
    if (!(max_vram > 0.f)) {
        return 0;
    }

    const size_t size_max = ~static_cast<size_t>(0);
    const double bytes    = static_cast<double>(max_vram) * MAX_VRAM_BYTES_PER_GIB;

    // static_cast<double>(size_max) may round up to 2^N on 64-bit targets, so the
    // comparison uses >= to reject that value as well.
    if (bytes >= static_cast<double>(size_max)) {
        return size_max;
    }
    return static_cast<size_t>(bytes);
}

// Expresses a byte count as a (possibly fractional) number of GiB.
// The division is carried out in double precision and narrowed to float at the end.
static float max_vram_bytes_to_gib(size_t max_vram_bytes) {
    const double byte_count = static_cast<double>(max_vram_bytes);
    const double gib        = byte_count / MAX_VRAM_BYTES_PER_GIB;
    return static_cast<float>(gib);
}

// Works out the graph VRAM budget, in bytes, for the "--max-vram -1" auto mode.
//
// The budget is the free memory reported for the backend's device minus
// MAX_VRAM_AUTO_RESERVE_BYTES. Returns 0 (graph splitting disabled) with a
// warning when there is no backend, no device, the device is a CPU device, or
// the reported free memory does not exceed the reserve.
static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
    if (!backend) {
        LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
        return 0;
    }

    ggml_backend_dev_t device = ggml_backend_get_device(backend);
    if (!device) {
        LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
        return 0;
    }

    const bool is_cpu_device = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
    if (is_cpu_device) {
        LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
        return 0;
    }

    size_t free_bytes  = 0;
    size_t total_bytes = 0;
    ggml_backend_dev_memory(device, &free_bytes, &total_bytes);

    // Byte count -> GiB as a double, used only for the log messages below.
    const auto as_gib = [](size_t byte_count) { return byte_count / MAX_VRAM_BYTES_PER_GIB; };

    if (free_bytes > MAX_VRAM_AUTO_RESERVE_BYTES) {
        const size_t budget_bytes = free_bytes - MAX_VRAM_AUTO_RESERVE_BYTES;
        LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
                 as_gib(free_bytes),
                 as_gib(total_bytes),
                 as_gib(budget_bytes));
        return budget_bytes;
    }

    // Nothing would remain after holding back the reserve.
    LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
             as_gib(free_bytes));
    return 0;
}

// Resolves the user-supplied max-VRAM setting into a concrete GiB value.
//
// A value of exactly -1 requests auto-detection: the budget is computed from the
// backend's device and converted back to GiB. Every other value is returned as given.
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
    const bool auto_requested = (max_vram == -1.f);
    if (auto_requested) {
        const size_t detected_bytes = resolve_auto_max_vram_bytes(backend);
        return max_vram_bytes_to_gib(detected_bytes);
    }
    return max_vram;
}

static Segment make_segment_seed(const Plan& plan,
size_t start_segment_index,
size_t end_segment_index) {
Expand Down
2 changes: 2 additions & 0 deletions src/ggml_graph_cut.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ namespace sd::ggml_graph_cut {
ggml_cgraph* gf,
const Segment& segment,
const char* log_desc);
// Converts a VRAM budget in GiB to bytes; budgets <= 0 yield 0.
size_t max_vram_gib_to_bytes(float max_vram);
// Returns max_vram unchanged unless it is exactly -1, in which case the budget is
// derived from the free memory reported for the backend's device (0 when it cannot be determined).
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend);
Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
Expand Down
10 changes: 4 additions & 6 deletions src/stable-diffusion.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "ggml_extend.hpp"
#include "ggml_graph_cut.h"

#include "model.h"
#include "rng.hpp"
Expand Down Expand Up @@ -209,6 +210,7 @@ class StableDiffusionGGML {
ggml_log_set(ggml_log_callback_default, nullptr);

init_backend();
max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend);

ModelLoader model_loader;

Expand Down Expand Up @@ -426,9 +428,7 @@ class StableDiffusionGGML {

bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;

const size_t max_graph_vram_bytes = max_vram <= 0.f
? 0
: static_cast<size_t>(static_cast<double>(max_vram) * 1024.0 * 1024.0 * 1024.0);
const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(max_vram);

{
clip_backend = backend;
Expand Down Expand Up @@ -3597,9 +3597,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
false,
request.hires.upscale_tile_size);
const size_t max_graph_vram_bytes = sd_ctx->sd->max_vram <= 0.f
? 0
: static_cast<size_t>(static_cast<double>(sd_ctx->sd->max_vram) * 1024.0 * 1024.0 * 1024.0);
const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
if (!hires_upscaler->load_from_file(request.hires.model_path,
sd_ctx->sd->offload_params_to_cpu,
Expand Down
Loading