From c556418b600ad5792440942079d93e393595688b Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov
Date: Mon, 28 Jul 2025 18:59:04 +0300
Subject: [PATCH] llama-bench : use local GPUs along with RPC servers (#14917)

Currently, if RPC servers are specified with '--rpc' and a local GPU is
available (e.g. CUDA), the benchmark runs only on the RPC device(s), yet
the backend result column says "CUDA,RPC", which is incorrect. This patch
adds all local GPU devices and makes llama-bench consistent with
llama-cli.
---
 tools/llama-bench/llama-bench.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index b80e984d0..c56834a2a 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -950,6 +950,7 @@ struct cmd_params_instance {
         }
         static std::vector<ggml_backend_dev_t> devices;
         devices.clear();
+        // RPC devices should always come first for performance reasons
         for (const std::string & server : rpc_servers) {
             ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
             if (dev) {
@@ -959,6 +960,20 @@ struct cmd_params_instance {
                 exit(1);
             }
         }
+        // add local GPU devices if any
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
+                case GGML_BACKEND_DEVICE_TYPE_CPU:
+                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;
+
+                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                    devices.push_back(dev);
+                    break;
+            }
+        }
         devices.push_back(nullptr);
         mparams.devices = devices.data();
     }
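
For reference, a small standalone sketch (not part of the patch) of the same ggml
device-enumeration API that the new loop relies on: it lists the local GPU devices
that llama-bench would now pick up alongside the RPC servers. The calls to
ggml_backend_load_all(), ggml_backend_dev_name() and ggml_backend_dev_description()
are assumed to be available from ggml-backend.h and may differ in older trees.

    // Standalone sketch: enumerate ggml backend devices and keep only local GPUs,
    // mirroring the loop added by this patch.
    #include <cstdio>
    #include <vector>
    #include "ggml-backend.h"

    int main() {
        ggml_backend_load_all();  // load dynamically built backends, if any

        std::vector<ggml_backend_dev_t> gpus;
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            // skip CPU/ACCEL devices, as the patch does
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
                gpus.push_back(dev);
            }
        }
        for (ggml_backend_dev_t dev : gpus) {
            printf("local GPU: %s (%s)\n",
                   ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
        }
        return 0;
    }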