Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions benchpress/config/benchmarks_ai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,16 @@ adsim:
- target_latency_msec
- target_latency_percentile

pytorch_gemm_gpuless:
parser: pytorch_gemm_gpuless
install_script: ./packages/ai_wdl/pytorch_gemm_gpuless/install_pytorch_gemm_gpuless.sh
cleanup_script: ./packages/ai_wdl/pytorch_gemm_gpuless/cleanup_pytorch_gemm_gpuless.sh
path: ./benchmarks/ai_wdl/pytorch_gemm_gpuless/run.sh
metrics:
- wall_time_per_call_us
- host_overhead_per_call_us
- simulated_tflops
- simulated_gpu_per_call_us

type_conversion:
parser: type_conversion
install_script: ./packages/ai_wdl/type_conversion/install_type_conversion.sh
Expand Down
72 changes: 72 additions & 0 deletions benchpress/config/jobs_ai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -340,3 +340,75 @@
description: Benchmark for common AI data type conversion
args:
- '--benchmark_format=json'

- benchmark: pytorch_gemm_gpuless
name: pytorch_gemm_gpuless_stage1
description: GPU-less torch.mm dispatch overhead via TorchDispatchMode (no CUDA needed).
args:
- stage1
- '-m {m}'
- '-n {n}'
- '-k {k}'
- '-t {dtype}'
- '--steps {steps}'
- '--warmups {warmups}'
- '--gpu-model {gpu_model}'
- '--efficiency {efficiency}'
- '--no-sleep'
vars:
- 'm=1024'
- 'n=1024'
- 'k=1024'
- 'dtype=bfloat16'
- 'steps=1000000'
- 'warmups=10000'
- 'gpu_model=gb200'
- 'efficiency=0.5'

- benchmark: pytorch_gemm_gpuless
name: pytorch_gemm_gpuless_stage2_nosleep
description: GPU-less torch.mm full host-side overhead via mock_cuda (requires libcuda.so.1).
args:
- stage2
- '-m {m}'
- '-n {n}'
- '-k {k}'
- '-t {dtype}'
- '--steps {steps}'
- '--warmups {warmups}'
- '--gpu-model {gpu_model}'
- '--efficiency {efficiency}'
- '--no-sleep'
vars:
- 'm=1024'
- 'n=1024'
- 'k=1024'
- 'dtype=bfloat16'
- 'steps=1000000'
- 'warmups=10000'
- 'gpu_model=gb200'
- 'efficiency=0.5'

- benchmark: pytorch_gemm_gpuless
name: pytorch_gemm_gpuless_stage2_spin
description: GPU-less torch.mm with spin delay (simulates GPU latency via clock_gettime polling).
args:
- stage2
- '-m {m}'
- '-n {n}'
- '-k {k}'
- '-t {dtype}'
- '--steps {steps}'
- '--warmups {warmups}'
- '--gpu-model {gpu_model}'
- '--efficiency {efficiency}'
- '--delay-mode spin'
vars:
- 'm=1024'
- 'n=1024'
- 'k=1024'
- 'dtype=bfloat16'
- 'steps=1000000'
- 'warmups=10000'
- 'gpu_model=gb200'
- 'efficiency=0.5'
2 changes: 2 additions & 0 deletions benchpress/plugins/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from .multichase_pointer import MultichasePointerParser
from .nginx_wrk_bench import NginxWrkParser
from .nnpi_net4 import NNPINet4Parser
from .pytorch_gemm_gpuless import PytorchGemmGpulessParser
from .rebatch import RebatchParser
from .returncode import ReturncodeParser
from .schbench import SchbenchParser
Expand Down Expand Up @@ -116,6 +117,7 @@ def register_parsers(factory):
factory.register("adsim", AdSimParser)
factory.register("cdn_bench", CDNBenchParser)
factory.register("type_conversion", TypeConversionParser)
factory.register("pytorch_gemm_gpuless", PytorchGemmGpulessParser)
factory.register("xsbench", XSBenchParser)
if not open_source:
factory.register("hackperf", HackperfParser)
Expand Down
49 changes: 49 additions & 0 deletions benchpress/plugins/parsers/pytorch_gemm_gpuless.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
import re

from benchpress.lib.parser import Parser


class PytorchGemmGpulessParser(Parser):
    """Parser for pytorch_gemm_gpuless benchmark output.

    Extracts metrics from both Stage 1 (TorchDispatchMode) and
    Stage 2 (mock_cuda) output formats:
        Wall time / call:     13.730 us
        Host overhead / call: 13.730 us
        Simulated TF/s:       156.440000
        Simulated GPU / call: 10.000 us   (stage 2 spin/sleep delay modes)
    """

    # (compiled pattern, metric key) pairs, compiled once at class-definition
    # time instead of re-looked-up on every stdout line. Order matches the
    # original first-match-wins branch order.
    _METRIC_PATTERNS = (
        (
            re.compile(r"Wall\s+time\s*/\s*call:\s+([\d.]+)\s*us"),
            "wall_time_per_call_us",
        ),
        (
            re.compile(r"Host\s+overhead\s*/\s*call:\s+([\d.]+)\s*us"),
            "host_overhead_per_call_us",
        ),
        (
            re.compile(r"Simulated\s+TF/s:\s+([\d.]+)"),
            "simulated_tflops",
        ),
        (
            re.compile(r"Simulated\s+GPU\s*/\s*call:\s+([\d.]+)\s*us"),
            "simulated_gpu_per_call_us",
        ),
    )

    def parse(self, stdout, stderr, returncode):
        """Parse benchmark stdout into a metrics dict.

        Args:
            stdout: iterable of stdout lines from the benchmark run.
            stderr: iterable of stderr lines (unused).
            returncode: benchmark process exit code (unused).

        Returns:
            Dict mapping metric names to float values. Empty if no
            recognized metric lines are present. If a metric line appears
            more than once, the last occurrence wins.
        """
        metrics = {}

        for raw_line in stdout:
            line = raw_line.strip()
            # First matching pattern claims the line; remaining patterns
            # are skipped, mirroring the if/continue chain this replaces.
            for pattern, key in self._METRIC_PATTERNS:
                m = pattern.search(line)
                if m:
                    metrics[key] = float(m.group(1))
                    break

        return metrics
62 changes: 62 additions & 0 deletions packages/ai_wdl/pytorch_gemm_gpuless/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# pytorch_gemm_gpuless

GPU-less `torch.mm` micro-benchmark that measures host-side dispatch overhead
without requiring a GPU. Designed for analyzing CPU frontend bottlenecks
(BTB/L1I capacity) on Neoverse V2 (GB200/GB300) and AMD Zen4.

## Stages

| Stage | What it Measures | Requirements |
|-------|-----------------|--------------|
| 1 (`TorchDispatchMode`) | Python dispatch overhead | Any machine |
| 2 (`mock_cuda`) | Full host-side overhead (C++ + CUDA driver API) | CUDA drivers (libcuda.so.1) |

Stage 2 requires NVIDIA driver userspace libraries (`cuda-compat` package).
No GPU hardware is needed — only the driver shared library for function table
patching. The install script auto-detects and installs `cuda-compat` if
available via package manager.

## Installation

```bash
./benchpress -b ai install pytorch_gemm_gpuless_stage1
./benchpress -b ai install pytorch_gemm_gpuless_stage2_nosleep
```

The install script will:
- Detect CUDA driver availability
- Install PyTorch CUDA (if drivers present) or PyTorch CPU (if not)
- Build C extensions (nop_delay, mock_cuda) via setuptools
- Stage 2 jobs will error at runtime if CUDA drivers are missing

## Run

```bash
# Stage 1 — pure host dispatch overhead (any machine)
./benchpress -b ai run pytorch_gemm_gpuless_stage1

# Stage 2 — full C++ dispatch overhead (requires CUDA drivers)
./benchpress -b ai run pytorch_gemm_gpuless_stage2_nosleep
./benchpress -b ai run pytorch_gemm_gpuless_stage2_spin
```

## Metrics

| Metric | Unit | Description |
|--------|------|-------------|
| `wall_time_per_call_us` | microseconds | Total wall time per torch.mm call |
| `host_overhead_per_call_us` | microseconds | Host dispatch overhead per call |
| `simulated_tflops` | TF/s | Simulated throughput |
| `simulated_gpu_per_call_us` | microseconds | Simulated GPU time per call (stage 2 spin/sleep delay modes only) |

## Sample Output

```json
{
"benchmark_name": "pytorch_gemm_gpuless_stage1",
"metrics": {
"wall_time_per_call_us": 76.196,
"host_overhead_per_call_us": 76.196,
"simulated_tflops": 28.183608
}
}
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Remove the installed pytorch_gemm_gpuless benchmark tree, located
# relative to this script (packages/ai_wdl/pytorch_gemm_gpuless/ ->
# benchmarks/ai_wdl/pytorch_gemm_gpuless/ under the benchpress root).
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
ROOT_DIR="$(readlink -f "${SCRIPT_DIR}/../../..")"

rm -rf "${ROOT_DIR}/benchmarks/ai_wdl/pytorch_gemm_gpuless"
Loading
Loading