From 9d0a521d8e751834dadb5e49e1699e679b44b844 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Wed, 3 Jun 2026 10:56:14 +0200 Subject: [PATCH] perf: allocate CPU-offloaded params from runtime device pinned host buffer --- src/ggml_extend.hpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index fe0fce9f9..9dada344a 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -3017,7 +3017,18 @@ struct GGMLRunner { LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str()); return true; } - params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); + // Pinned host buffer when CPU-offloaded for DMA-direct H2D. + ggml_backend_buffer_type_t params_buft = nullptr; + if (params_backend != runtime_backend) { + ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend); + if (runtime_dev != nullptr) { + params_buft = ggml_backend_dev_host_buffer_type(runtime_dev); + } + } + if (params_buft == nullptr) { + params_buft = ggml_backend_get_default_buffer_type(params_backend); + } + params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft); if (params_buffer == nullptr) { LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i", get_desc().c_str(),