From 5401fb1c66803cff99d11b8cba8361112582bdca Mon Sep 17 00:00:00 2001
From: fszontagh
Date: Tue, 2 Jun 2026 18:55:25 +0200
Subject: [PATCH 1/2] Keep chunk-K residency engaged with runtime LoRA

---
 src/ggml_extend.hpp | 64 +++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index ef104368a..9f5a9e2ca 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -2432,12 +2432,17 @@ struct GGMLRunner {
         GGML_ASSERT(gf != nullptr);
 
         // Keep the plan and resident params under the same live-VRAM cap.
+        // Add back our own resident buffer so we don't see chunk-K's
+        // allocation as "taken" VRAM and shrink the budget on every step.
         size_t effective_budget = max_graph_vram_bytes;
         if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
             ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
             if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
                 size_t free_vram = 0, total_vram = 0;
                 ggml_backend_dev_memory(dev, &free_vram, &total_vram);
+                if (resident_runtime_params_buffer != nullptr) {
+                    free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer);
+                }
                 constexpr size_t safety_margin = 512ull * 1024 * 1024;
                 size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
                 if (free_clamp < effective_budget) {
@@ -2815,39 +2820,36 @@ struct GGMLRunner {
                                     bool no_return = false) {
         GGML_ASSERT(gf != nullptr);
 
-        // Runtime LoRA mutates CPU weights between calls, so resident GPU
-        // copies would go stale.
-        if (weight_adapter != nullptr) {
-            restore_resident_params();
-        } else {
-            sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
-            if (base_plan.available) {
-                sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
-
-                std::vector<ggml_tensor*> resident_params;
-                uint64_t token = 0;
-                for (const auto& segment : base_plan.segments) {
-                    if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
+        // Runtime LoRA composes `weight + diff` in the compute graph via
+        // ggml_add; the resident weight tensor's data is never mutated, so
+        // chunk-K residency stays valid across sampling steps.
+        sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
+        if (base_plan.available) {
+            sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
+
+            std::vector<ggml_tensor*> resident_params;
+            uint64_t token = 0;
+            for (const auto& segment : base_plan.segments) {
+                if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
+                    continue;
+                }
+                auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
+                for (ggml_tensor* t : seg_params) {
+                    if (t == nullptr)
                         continue;
-                    }
-                    auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
-                    for (ggml_tensor* t : seg_params) {
-                        if (t == nullptr)
-                            continue;
-                        resident_params.push_back(t);
-                        token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
-                    }
+                    resident_params.push_back(t);
+                    token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
                 }
-                if (token != resident_state_token) {
-                    restore_resident_params();
-                    if (!resident_params.empty()) {
-                        if (offload_resident_params(resident_params)) {
-                            resident_state_token = token;
-                        } else {
-                            LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
-                                      get_desc().c_str());
-                            restore_resident_params();
-                        }
+            }
+            if (token != resident_state_token) {
+                restore_resident_params();
+                if (!resident_params.empty()) {
+                    if (offload_resident_params(resident_params)) {
+                        resident_state_token = token;
+                    } else {
+                        LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
+                                  get_desc().c_str());
+                        restore_resident_params();
                     }
                 }
             }

From 7bc4b7121c6ccda28c5a5710a683db5959204df3 Mon Sep 17 00:00:00 2001
From: fszontagh
Date: Wed, 3 Jun 2026 13:46:15 +0200
Subject: [PATCH 2/2] fix: reserve worst merged segment from chunk-K residency
 budget

---
 src/ggml_extend.hpp | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 9f5a9e2ca..fe0fce9f9 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -2823,9 +2823,27 @@ struct GGMLRunner {
         // Runtime LoRA composes `weight + diff` in the compute graph via
         // ggml_add; the resident weight tensor's data is never mutated, so
         // chunk-K residency stays valid across sampling steps.
+        // Reserve room for the worst merged segment so chunk-K can't grow
+        // large enough to starve later partial-param allocations.
+        size_t worst_merged_segment_footprint = 0;
+        for (const auto& seg : plan.segments) {
+            const size_t fp = seg.input_param_bytes +
+                              seg.compute_buffer_size +
+                              seg.output_bytes +
+                              seg.input_previous_cut_bytes +
+                              seg.input_external_bytes;
+            if (fp > worst_merged_segment_footprint) {
+                worst_merged_segment_footprint = fp;
+            }
+        }
+        const size_t residency_budget_for_annotate =
+            residency_budget_bytes > worst_merged_segment_footprint
+                ? residency_budget_bytes - worst_merged_segment_footprint
+                : 0;
+
         sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
         if (base_plan.available) {
-            sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
+            sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_for_annotate);
 
             std::vector<ggml_tensor*> resident_params;
             uint64_t token = 0;