From 9d0a521d8e751834dadb5e49e1699e679b44b844 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Wed, 3 Jun 2026 10:56:14 +0200
Subject: [PATCH] perf: allocate CPU-offloaded params from runtime device
 pinned host buffer

---
 src/ggml_extend.hpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index fe0fce9f9..9dada344a 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -3017,7 +3017,18 @@ struct GGMLRunner {
             LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
             return true;
         }
-        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
+        // CPU-offloaded params: use the runtime device's pinned host buffer so host-to-device copies can use direct DMA.
+        ggml_backend_buffer_type_t params_buft = nullptr;
+        if (params_backend != runtime_backend) {
+            ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend);
+            if (runtime_dev != nullptr) {
+                params_buft = ggml_backend_dev_host_buffer_type(runtime_dev);
+            }
+        }
+        if (params_buft == nullptr) {
+            params_buft = ggml_backend_get_default_buffer_type(params_backend);
+        }
+        params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft);
         if (params_buffer == nullptr) {
             LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                       get_desc().c_str(),