Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions benchpress/config/benchmarks_ai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,16 @@ adsim:
- target_latency_msec
- target_latency_percentile

pytorch_gemm_gpuless:
parser: pytorch_gemm_gpuless
install_script: ./packages/ai_wdl/pytorch_gemm_gpuless/install_pytorch_gemm_gpuless.sh
cleanup_script: ./packages/ai_wdl/pytorch_gemm_gpuless/cleanup_pytorch_gemm_gpuless.sh
path: ./benchmarks/ai_wdl/pytorch_gemm_gpuless/run.sh
metrics:
- wall_time_per_call_us
- host_overhead_per_call_us
- simulated_tflops
- simulated_gpu_per_call_us

type_conversion:
parser: type_conversion
install_script: ./packages/ai_wdl/type_conversion/install_type_conversion.sh
Expand Down
72 changes: 72 additions & 0 deletions benchpress/config/jobs_ai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -340,3 +340,75 @@
description: Benchmark for common AI data type conversion
args:
- '--benchmark_format=json'

- benchmark: pytorch_gemm_gpuless
name: pytorch_gemm_gpuless_stage1
description: GPU-less torch.mm dispatch overhead via TorchDispatchMode (no CUDA needed).
args:
- stage1
- '-m {m}'
- '-n {n}'
- '-k {k}'
- '-t {dtype}'
- '--steps {steps}'
- '--warmups {warmups}'
- '--gpu-model {gpu_model}'
- '--efficiency {efficiency}'
- '--no-sleep'
vars:
- 'm=1024'
- 'n=1024'
- 'k=1024'
- 'dtype=bfloat16'
- 'steps=1000000'
- 'warmups=10000'
- 'gpu_model=gb200'
- 'efficiency=0.5'

- benchmark: pytorch_gemm_gpuless
name: pytorch_gemm_gpuless_stage2_nosleep
description: GPU-less torch.mm full host-side overhead via mock_cuda (requires libcuda.so.1).
args:
- stage2
- '-m {m}'
- '-n {n}'
- '-k {k}'
- '-t {dtype}'
- '--steps {steps}'
- '--warmups {warmups}'
- '--gpu-model {gpu_model}'
- '--efficiency {efficiency}'
- '--no-sleep'
vars:
- 'm=1024'
- 'n=1024'
- 'k=1024'
- 'dtype=bfloat16'
- 'steps=1000000'
- 'warmups=10000'
- 'gpu_model=gb200'
- 'efficiency=0.5'

- benchmark: pytorch_gemm_gpuless
name: pytorch_gemm_gpuless_stage2_spin
description: GPU-less torch.mm with spin delay (simulates GPU latency via clock_gettime polling).
args:
- stage2
- '-m {m}'
- '-n {n}'
- '-k {k}'
- '-t {dtype}'
- '--steps {steps}'
- '--warmups {warmups}'
- '--gpu-model {gpu_model}'
- '--efficiency {efficiency}'
- '--delay-mode spin'
vars:
- 'm=1024'
- 'n=1024'
- 'k=1024'
- 'dtype=bfloat16'
- 'steps=1000000'
- 'warmups=10000'
- 'gpu_model=gb200'
- 'efficiency=0.5'
2 changes: 2 additions & 0 deletions benchpress/plugins/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from .multichase_pointer import MultichasePointerParser
from .nginx_wrk_bench import NginxWrkParser
from .nnpi_net4 import NNPINet4Parser
from .pytorch_gemm_gpuless import PytorchGemmGpulessParser
from .rebatch import RebatchParser
from .returncode import ReturncodeParser
from .schbench import SchbenchParser
Expand Down Expand Up @@ -116,6 +117,7 @@ def register_parsers(factory):
factory.register("adsim", AdSimParser)
factory.register("cdn_bench", CDNBenchParser)
factory.register("type_conversion", TypeConversionParser)
factory.register("pytorch_gemm_gpuless", PytorchGemmGpulessParser)
factory.register("xsbench", XSBenchParser)
if not open_source:
factory.register("hackperf", HackperfParser)
Expand Down
49 changes: 49 additions & 0 deletions benchpress/plugins/parsers/pytorch_gemm_gpuless.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
import re

from benchpress.lib.parser import Parser


class PytorchGemmGpulessParser(Parser):
    """Parser for pytorch_gemm_gpuless benchmark output.

    Extracts metrics from both Stage 1 (TorchDispatchMode) and
    Stage 2 (mock_cuda) output formats:
        Wall time / call:     13.730 us
        Host overhead / call: 13.730 us
        Simulated TF/s:       156.440000
        Simulated GPU / call: 10.000 us   (stage 2 spin/sleep delay modes)
    """

    # (compiled pattern, metric key) pairs, compiled once at class-definition
    # time instead of re-looked-up on every stdout line. Order matches the
    # original first-match-wins branch order.
    _METRIC_PATTERNS = (
        (
            re.compile(r"Wall\s+time\s*/\s*call:\s+([\d.]+)\s*us"),
            "wall_time_per_call_us",
        ),
        (
            re.compile(r"Host\s+overhead\s*/\s*call:\s+([\d.]+)\s*us"),
            "host_overhead_per_call_us",
        ),
        (
            re.compile(r"Simulated\s+TF/s:\s+([\d.]+)"),
            "simulated_tflops",
        ),
        (
            re.compile(r"Simulated\s+GPU\s*/\s*call:\s+([\d.]+)\s*us"),
            "simulated_gpu_per_call_us",
        ),
    )

    def parse(self, stdout, stderr, returncode):
        """Parse benchmark stdout into a metrics dict.

        Args:
            stdout: iterable of stdout lines from the benchmark run.
            stderr: iterable of stderr lines (unused).
            returncode: benchmark process exit code (unused).

        Returns:
            Dict mapping metric names to float values. Empty if no
            recognized metric lines are present. If a metric line appears
            more than once, the last occurrence wins.
        """
        metrics = {}

        for raw_line in stdout:
            line = raw_line.strip()
            # First matching pattern claims the line; remaining patterns
            # are skipped, mirroring the if/continue chain this replaces.
            for pattern, key in self._METRIC_PATTERNS:
                m = pattern.search(line)
                if m:
                    metrics[key] = float(m.group(1))
                    break

        return metrics
62 changes: 62 additions & 0 deletions packages/ai_wdl/pytorch_gemm_gpuless/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# pytorch_gemm_gpuless

GPU-less `torch.mm` micro-benchmark that measures host-side dispatch overhead
without requiring a GPU. Designed for analyzing CPU frontend bottlenecks
(BTB/L1I capacity) on Neoverse V2 (GB200/GB300) and AMD Zen4.

## Stages

| Stage | What it Measures | Requirements |
|-------|-----------------|--------------|
| 1 (`TorchDispatchMode`) | Python dispatch overhead | Any machine |
| 2 (`mock_cuda`) | Full host-side overhead (C++ + CUDA driver API) | CUDA drivers (libcuda.so.1) |

Stage 2 requires NVIDIA driver userspace libraries (`cuda-compat` package).
No GPU hardware is needed — only the driver shared library for function table
patching. The install script auto-detects and installs `cuda-compat` if
available via package manager.

## Installation

```bash
./benchpress -b ai install pytorch_gemm_gpuless_stage1
./benchpress -b ai install pytorch_gemm_gpuless_stage2_nosleep
```

The install script will:
- Detect CUDA driver availability
- Install PyTorch CUDA (if drivers present) or PyTorch CPU (if not)
- Build C extensions (nop_delay, mock_cuda) via setuptools
- Stage 2 jobs will error at runtime if CUDA drivers are missing

## Run

```bash
# Stage 1 — pure host dispatch overhead (any machine)
./benchpress -b ai run pytorch_gemm_gpuless_stage1

# Stage 2 — full C++ dispatch overhead (requires CUDA drivers)
./benchpress -b ai run pytorch_gemm_gpuless_stage2_nosleep
./benchpress -b ai run pytorch_gemm_gpuless_stage2_spin
```

## Metrics

| Metric | Unit | Description |
|--------|------|-------------|
| `wall_time_per_call_us` | microseconds | Total wall time per torch.mm call |
| `host_overhead_per_call_us` | microseconds | Host dispatch overhead per call |
| `simulated_tflops` | TF/s | Simulated throughput |
| `simulated_gpu_per_call_us` | microseconds | Simulated GPU time per call (stage 2 spin/sleep delay modes only) |

## Sample Output

```json
{
"benchmark_name": "pytorch_gemm_gpuless_stage1",
"metrics": {
"wall_time_per_call_us": 76.196,
"host_overhead_per_call_us": 76.196,
"simulated_tflops": 28.183608
}
}
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Remove the installed pytorch_gemm_gpuless benchmark tree, located
# relative to this script (packages/ai_wdl/pytorch_gemm_gpuless/ ->
# benchmarks/ai_wdl/pytorch_gemm_gpuless/ under the benchpress root).
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
ROOT_DIR="$(readlink -f "${SCRIPT_DIR}/../../..")"

rm -rf "${ROOT_DIR}/benchmarks/ai_wdl/pytorch_gemm_gpuless"
Loading
Loading