diff --git a/.github/workflows/build-rocm-wheels.yml b/.github/workflows/build-rocm-wheels.yml index edce8bf38213..46012b99614c 100644 --- a/.github/workflows/build-rocm-wheels.yml +++ b/.github/workflows/build-rocm-wheels.yml @@ -15,12 +15,12 @@ on: required: false rocm_arch: description: 'ROCm architecture (e.g., gfx1151, gfx1150;gfx1151)' - default: 'gfx1103;gfx1150;gfx1151' + default: 'gfx1151' # TODO: restore to 'gfx1103;gfx1150;gfx1151' before merging required: false env: PYTORCH_INDEX_URL: ${{ github.event.inputs.pytorch_index || 'https://rocm.nightlies.amd.com/v2-staging/gfx1151' }} - PYTORCH_ROCM_ARCH: ${{ github.event.inputs.rocm_arch || 'gfx1103;gfx1150;gfx1151' }} + PYTORCH_ROCM_ARCH: ${{ github.event.inputs.rocm_arch || 'gfx1151' }} # TODO: restore to 'gfx1103;gfx1150;gfx1151' before merging CI_IMAGE: ghcr.io/rocm/vllm/gfx11-ci:latest jobs: @@ -186,6 +186,22 @@ jobs: else echo "amd-smi not found" fi + echo "=== GPU clocks, power profile, and temperature ===" + for card in /sys/class/drm/card[0-9]/device /sys/class/drm/card[0-9][0-9]/device; do + [ -d "$card" ] || continue + echo "--- ${card} ---" + for f in "$card"/hwmon/hwmon*/temp*_input; do + [ -f "$f" ] || continue + echo "${f#"$card"/}: $(( $(cat "$f") / 1000 )) °C" + done + f=power_dpm_force_performance_level + [ -f "$card/$f" ] && echo "$f: $(cat "$card/$f")" + for f in pp_dpm_mclk pp_dpm_sclk pp_power_profile_mode; do + [ -f "$card/$f" ] || continue + echo "$f:" + head -20 "$card/$f" | sed 's/^/ /' + done + done - name: Install wheel and test dependencies run: | @@ -233,7 +249,8 @@ jobs: tests/kernels/quantization/test_hybrid_w4a16_triton.py \ tests/kernels/quantization/test_rocm_compressed_tensors_w4a16.py \ tests/kernels/quantization/test_rocm_skinny_gemms.py \ - tests/quantization/test_hip_w4a16_kernel.py + tests/quantization/test_hip_w4a16_kernel.py \ + tests/kernels/quantization/test_hybrid_w4a16_perf.py upload-wheel: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 
134bbc5cc893..adcb9f4b191f 100644 --- a/.gitignore +++ b/.gitignore @@ -245,3 +245,6 @@ vllm/grpc/vllm_engine_pb2.pyi # Ignore generated cpu headers csrc/cpu/cpu_attn_dispatch_generated.h + +# Measured performance baselines (never committed) +tests/kernels/quantization/measured/ diff --git a/benchmarks/kernels/benchmark_hybrid_w4a16_gemm.py b/benchmarks/kernels/benchmark_hybrid_w4a16_gemm.py deleted file mode 100644 index 8336c1e7c94d..000000000000 --- a/benchmarks/kernels/benchmark_hybrid_w4a16_gemm.py +++ /dev/null @@ -1,188 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Benchmark the HybridW4A16LinearKernel across decode and prefill shapes. - -Usage: - python benchmark_int4_gemm.py - python benchmark_int4_gemm.py --models Qwen/Qwen3-4B - python benchmark_int4_gemm.py --group-size 128 -""" - -import argparse -import copy -import itertools -import os - -import torch - -from vllm.triton_utils import triton - -# --------------------------------------------------------------------------- -# Weight shapes: [K, N], TP_SPLIT_DIM -# --------------------------------------------------------------------------- -WEIGHT_SHAPES = { - "Qwen/Qwen3-4B": [ - ([2560, 3840], 1), # qkv_proj - ([2560, 2560], 0), # o_proj - ([2560, 19456], 1), # gate_up_proj - ([9728, 2560], 0), # down_proj - ], - "Qwen/Qwen2.5-7B-Instruct": [ - ([3584, 4608], 1), - ([3584, 3584], 0), - ([3584, 37888], 1), - ([18944, 3584], 0), - ], -} - - -# --------------------------------------------------------------------------- -# Weight packing -# --------------------------------------------------------------------------- -def prepare_hybrid_weights(K, N, group_size, device="cuda"): - """Create random weights for benchmarking. - - Returns (w_q_skinny, w_s_skinny, w_fp16, w_q_skinny_i32). 
- """ - num_groups = K // group_size - - # Random packed weights — actual values don't matter for throughput - w_q_skinny_i32 = torch.randint( - 0, 2**31, (N, K // 8), dtype=torch.int32, device=device - ) - w_q_skinny = w_q_skinny_i32.view(torch.int8).contiguous() - w_s_skinny = torch.randn(N, num_groups, dtype=torch.float16, device=device) * 0.01 - - # Raw per-group zero-points for asymmetric benchmarks - w_zp = torch.randint(0, 16, (N, num_groups), dtype=torch.int32, device=device).to( - torch.float16 - ) - - # FP16 baseline for F.linear - w_fp16 = torch.randn(N, K, dtype=torch.float16, device=device) * 0.01 - - return w_q_skinny, w_s_skinny, w_fp16, w_q_skinny_i32, w_zp - - -# --------------------------------------------------------------------------- -# Benchmark -# --------------------------------------------------------------------------- -PROVIDERS = ["torch-fp16", "hybrid-w4a16", "hybrid-w4a16-zp"] - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["batch_size"], - x_vals=[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096], - x_log=False, - line_arg="provider", - line_vals=PROVIDERS, - line_names=PROVIDERS, - ylabel="TFLOP/s (larger is better)", - plot_name="FP16 vs Hybrid W4A16", - args={}, - ) -) -def benchmark(batch_size, provider, N, K, group_size, weights): - M = batch_size - device = "cuda" - dtype = torch.float16 - a = torch.randn((M, K), device=device, dtype=dtype) - - quantiles = [0.5, 0.2, 0.8] - - if provider == "torch-fp16": - w_fp16 = weights["w_fp16"] - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - lambda: torch.nn.functional.linear(a, w_fp16), - quantiles=quantiles, - ) - elif provider in ("hybrid-w4a16", "hybrid-w4a16-zp"): - from vllm.model_executor.kernels.linear.mixed_precision.hybrid_w4a16 import ( - _hybrid_w4a16_apply_impl, - ) - from vllm.utils.platform_utils import num_compute_units - - w = weights - cu_count = num_compute_units() - use_zp = provider == "hybrid-w4a16-zp" - - def run(): - return 
_hybrid_w4a16_apply_impl( - a, - w["w_q_skinny"], - w["w_s_skinny"], - w["w_q_skinny_i32"], - w["w_zp"] if use_zp else None, - None, # bias - cu_count, - group_size, - ) - - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - run, - quantiles=quantiles, - ) - else: - return 0.0, 0.0, 0.0 - - to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) - return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) - - -def prepare_shapes(args): - KN_model_names = [] - for model, tp_size in itertools.product(args.models, args.tp_sizes): - for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): - KN[tp_dim] //= tp_size - KN.append(model) - KN_model_names.append(KN) - return KN_model_names - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark HybridW4A16LinearKernel") - parser.add_argument( - "--models", - nargs="+", - type=str, - default=["Qwen/Qwen3-4B"], - choices=list(WEIGHT_SHAPES.keys()), - ) - parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) - parser.add_argument("--group-size", type=int, default=128) - parser.add_argument("--save-path", type=str, default=None) - args = parser.parse_args() - - for K, N, model in prepare_shapes(args): - group_size = args.group_size - print(f"\n{'=' * 70}") - print(f"{model}, N={N} K={K}, group_size={group_size}") - print(f"{'=' * 70}") - - w_q_skinny, w_s_skinny, w_fp16, w_q_skinny_i32, w_zp = prepare_hybrid_weights( - K, N, group_size - ) - - weights = { - "w_q_skinny": w_q_skinny, - "w_s_skinny": w_s_skinny, - "w_fp16": w_fp16, - "w_q_skinny_i32": w_q_skinny_i32, - "w_zp": w_zp, - } - - save_path = args.save_path or f"bench_int4_res_n{N}_k{K}" - os.makedirs(save_path, exist_ok=True) - benchmark.run( - print_data=True, - show_plots=False, - save_path=save_path, - N=N, - K=K, - group_size=group_size, - weights=weights, - ) - - print("\nBenchmark finished!") diff --git a/pyproject.toml b/pyproject.toml index f37d5ce02375..51818d98a6e8 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -109,6 +109,7 @@ markers = [ "split: run this test as part of a split", "distributed: run this test only in distributed GPU tests", "optional: optional tests that are automatically skipped, include --optional to run them", + "benchmark: performance regression tests", ] [tool.ty.src] diff --git a/tests/conftest.py b/tests/conftest.py index 49400361c229..6c3b676ca0ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1439,6 +1439,18 @@ def pytest_addoption(parser): default=1000 + secrets.randbelow(9000), help="random seed for tests that opt in", ) + parser.addoption( + "--measure-baselines", + action="store_true", + default=False, + help="record performance measurements instead of asserting", + ) + parser.addoption( + "--intermittent", + action="store_true", + default=False, + help="include intermittent (noisy) benchmark cases", + ) def pytest_report_header(config): diff --git a/tests/kernels/quantization/conftest.py b/tests/kernels/quantization/conftest.py new file mode 100644 index 000000000000..1feba0e05894 --- /dev/null +++ b/tests/kernels/quantization/conftest.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""conftest for quantization kernel tests. + +Adds session-finish hook that writes measured results to the ``measured/`` +directory when ``--measure-baselines`` is set. +""" + +from __future__ import annotations + +import json +import pathlib +import shutil + +import pytest + +_HERE = pathlib.Path(__file__).resolve().parent +_MEASURED_DIR = _HERE / "measured" + +# Attribute name on config for the measured-results dict. 
+_ATTR = "_hybrid_w4a16_measured_results" +_TEMP_ATTR = "_hybrid_w4a16_temp_log" + + +def get_measured_results(config: pytest.Config) -> dict[str, list[dict]]: + """Return (creating if needed) the session-scoped measurement dict.""" + d = getattr(config, _ATTR, None) + if d is None: + d = {} + setattr(config, _ATTR, d) + return d + + +def get_temp_log(config: pytest.Config) -> list[tuple[float, str, float]]: + """Return (creating if needed) the session-scoped temperature log.""" + log = getattr(config, _TEMP_ATTR, None) + if log is None: + log = [] + setattr(config, _TEMP_ATTR, log) + return log + + +def pytest_configure(config: pytest.Config) -> None: + if config.getoption("--measure-baselines", default=False): + # Wipe previous measured/ so aborted runs don't leave stale data. + if _MEASURED_DIR.exists(): + shutil.rmtree(_MEASURED_DIR) + _MEASURED_DIR.mkdir(parents=True, exist_ok=True) + + +def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None: + # Write temperature log if path was set + import os + + temp_log_path = os.environ.get("TEMP_LOG_PATH", "") + if temp_log_path: + log = get_temp_log(session.config) + if log: + t0 = log[0][0] + with open(temp_log_path, "w") as f: + f.write("elapsed_s,label,temp_C\n") + for ts, label, temp in log: + f.write(f"{ts - t0:.2f},{label},{temp:.1f}\n") + + if not session.config.getoption("--measure-baselines", default=False): + return + + results = get_measured_results(session.config) + if not results: + return + + _MEASURED_DIR.mkdir(parents=True, exist_ok=True) + + for gpu, shapes in results.items(): + out_path = _MEASURED_DIR / f"hybrid_w4a16_{gpu}.json" + data = {"gpu": gpu, "shapes": shapes} + out_path.write_text(json.dumps(data, indent=2) + "\n") diff --git a/tests/kernels/quantization/golden/README.md b/tests/kernels/quantization/golden/README.md new file mode 100644 index 000000000000..e3dfee5d085d --- /dev/null +++ b/tests/kernels/quantization/golden/README.md @@ -0,0 +1,119 @@ +# Golden Baselines 
for Kernel Performance Regression Tests + + This directory contains per-GPU JSON files with golden TFLOP/s baselines for + the hybrid W4A16 kernel. The test in `test_hybrid_w4a16_perf.py` compares + measured performance against these values using a two-sided tolerance band. + +## JSON Schema + + ```json + { + "gpu": "", + "shapes": [ + { + "in_features": 2560, + "out_features": 3840, + "group_size": 128, + "comment": "Qwen3-4B qkv_proj", + "skip": "(optional) reason to skip entire shape", + "providers": [ + { + "provider": "hybrid-w4a16", + "skip": "(optional) reason to skip this provider", + "baselines": [ + { + "batch_size": 1, + "tflops": 5.12, + "expected_failure": "(optional) reason this is expected to fail", + "intermittent": false, + "skip": "(optional) reason to skip this batch size" + } + ] + } + ] + } + ] + } + ``` + +### Fields + + | Field | Level | Description | + | --- | --- | --- | + | `gpu` | top | GCN architecture prefix (e.g. `gfx1151`). Matched against `_GCN_ARCH` at runtime. | + | `in_features` | shape | K dimension of the GEMM. | + | `out_features` | shape | N dimension of the GEMM. | + | `group_size` | shape | Quantization group size (typically 128). | + | `comment` | shape | Human-readable label (model name + layer). | + | `skip` | shape/provider/baseline | When present, the item is skipped. Value is the reason string. `tflops` is not required when `skip` is set. | + | `provider` | provider | Kernel variant: `hybrid-w4a16` or `hybrid-w4a16-zp`. | + | `tflops` | baseline | Golden TFLOP/s value for this batch size. | + | `batch_size` | baseline | M dimension (number of tokens). | + | `expected_failure` | baseline | When present, out-of-band results are silently accepted. If the measurement lands *inside* the band, the test errors (unexpected pass). | + | `intermittent` | baseline | When `true`, this batch size is skipped unless `--intermittent` is passed. 
| + +### Constraints + +- Shapes in the JSON must be a subset of `SHAPES` in + `test_hybrid_w4a16_perf.py`. Extra shapes in the JSON cause a collection + error. +- Batch sizes in the JSON must be a subset of `BATCH_SIZES` in the test file. + Extra batch sizes cause a collection error. +- Shapes are sorted by `(in_features, out_features, group_size)` for clean + diffs. + +## Adding a New GPU Target + + 1. Run `--measure-baselines` on the new GPU. No golden file needs to exist + first -- the test bootstraps from `SHAPES x PROVIDERS x BATCH_SIZES`: + + ```bash + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \ + --measure-baselines -s + ``` + + 2. Inspect `measured/hybrid_w4a16_.json`. + + 3. Copy the measured file here: + + ```bash + cp tests/kernels/quantization/measured/hybrid_w4a16_.json \ + tests/kernels/quantization/golden/ + ``` + + 4. Commit. No code changes needed -- the test auto-discovers JSON files by + GPU match. + +## Adding New Shapes + + 1. Add the shape to `SHAPES` in `test_hybrid_w4a16_perf.py`. + 2. Run `--measure-baselines` on each GPU that has a golden file. + 3. Diff `golden/` vs `measured/`, copy updated files, commit. + +## Contributor Workflow + +### No performance impact expected + + Push the change. CI runs the test. If it passes, done. + +### Intentional performance change + + ```bash + # On a matching GPU: + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \ + --measure-baselines -s + + # Compare: + diff -r tests/kernels/quantization/golden/ \ + tests/kernels/quantization/measured/ + + # Update golden values: + cp tests/kernels/quantization/measured/hybrid_w4a16_gfx1151.json \ + tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json + + # Commit JSON alongside the kernel change: + git add tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json + git commit + ``` + + The `measured/` directory is in `.gitignore` and should never be committed. 
diff --git a/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json b/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json new file mode 100644 index 000000000000..ad09bf67d162 --- /dev/null +++ b/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json @@ -0,0 +1,2079 @@ +{ + "gpu": "gfx1151", + "shapes": [ + { + "in_features": 2048, + "out_features": 2048, + "group_size": 128, + "comment": "gemma-2b o_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.3956 + }, + { + "batch_size": 2, + "tflops": 2.1364 + }, + { + "batch_size": 4, + "tflops": 2.6879 + }, + { + "batch_size": 8, + "tflops": 3.4325 + }, + { + "batch_size": 16, + "tflops": 6.7379 + }, + { + "batch_size": 32, + "tflops": 13.0322 + }, + { + "batch_size": 64, + "tflops": 12.7756 + }, + { + "batch_size": 128, + "tflops": 20.2416 + }, + { + "batch_size": 256, + "tflops": 17.5314 + }, + { + "batch_size": 512, + "tflops": 18.471 + }, + { + "batch_size": 1024, + "tflops": 18.5844 + }, + { + "batch_size": 2048, + "tflops": 23.7047 + }, + { + "batch_size": 4096, + "tflops": 23.3858 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.2443 + }, + { + "batch_size": 2, + "tflops": 1.905 + }, + { + "batch_size": 4, + "tflops": 2.5669 + }, + { + "batch_size": 8, + "tflops": 3.4513 + }, + { + "batch_size": 16, + "tflops": 6.8081 + }, + { + "batch_size": 32, + "tflops": 13.0672 + }, + { + "batch_size": 64, + "tflops": 12.2089 + }, + { + "batch_size": 128, + "tflops": 19.5104 + }, + { + "batch_size": 256, + "tflops": 17.2851 + }, + { + "batch_size": 512, + "tflops": 17.2427 + }, + { + "batch_size": 1024, + "tflops": 18.065 + }, + { + "batch_size": 2048, + "tflops": 22.2069 + }, + { + "batch_size": 4096, + "tflops": 22.3827 + } + ] + } + ] + }, + { + "in_features": 2048, + "out_features": 2560, + "group_size": 128, + "comment": "gemma-2b qkv_proj", + "providers": [ + { + "provider": 
"hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.2669 + }, + { + "batch_size": 2, + "tflops": 2.1339 + }, + { + "batch_size": 4, + "tflops": 3.0023 + }, + { + "batch_size": 8, + "tflops": 3.875 + }, + { + "batch_size": 16, + "tflops": 7.5271 + }, + { + "batch_size": 32, + "tflops": 14.0853 + }, + { + "batch_size": 64, + "tflops": 13.6546 + }, + { + "batch_size": 128, + "tflops": 21.2751 + }, + { + "batch_size": 256, + "tflops": 17.3688 + }, + { + "batch_size": 512, + "tflops": 17.7514 + }, + { + "batch_size": 1024, + "tflops": 19.0102 + }, + { + "batch_size": 2048, + "tflops": 24.1754 + }, + { + "batch_size": 4096, + "tflops": 24.3621 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.1927 + }, + { + "batch_size": 2, + "tflops": 1.9083 + }, + { + "batch_size": 4, + "tflops": 2.8705 + }, + { + "batch_size": 8, + "tflops": 3.7228 + }, + { + "batch_size": 16, + "tflops": 7.3855 + }, + { + "batch_size": 32, + "tflops": 13.791 + }, + { + "batch_size": 64, + "tflops": 13.0059 + }, + { + "batch_size": 128, + "tflops": 21.3741 + }, + { + "batch_size": 256, + "tflops": 16.9618 + }, + { + "batch_size": 512, + "tflops": 17.3823 + }, + { + "batch_size": 1024, + "tflops": 18.4831 + }, + { + "batch_size": 2048, + "tflops": 23.381 + }, + { + "batch_size": 4096, + "tflops": 23.9851 + } + ] + } + ] + }, + { + "in_features": 2048, + "out_features": 32768, + "group_size": 128, + "comment": "gemma-2b gate_up_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9247 + }, + { + "batch_size": 2, + "tflops": 1.7998 + }, + { + "batch_size": 4, + "tflops": 3.4455 + }, + { + "batch_size": 8, + "tflops": 4.4411 + }, + { + "batch_size": 16, + "tflops": 8.6629 + }, + { + "batch_size": 32, + "tflops": 16.4144 + }, + { + "batch_size": 64, + "tflops": 9.2579 + }, + { + "batch_size": 128, + "tflops": 21.4924 + }, + { + "batch_size": 256, + "tflops": 25.5516 + }, + { + 
"batch_size": 512, + "tflops": 25.4765 + }, + { + "batch_size": 1024, + "tflops": 24.4269 + }, + { + "batch_size": 2048, + "tflops": 21.9715 + }, + { + "batch_size": 4096, + "tflops": 19.8008 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.8996 + }, + { + "batch_size": 2, + "tflops": 1.7079 + }, + { + "batch_size": 4, + "tflops": 3.2741 + }, + { + "batch_size": 8, + "tflops": 4.1717 + }, + { + "batch_size": 16, + "tflops": 8.0755 + }, + { + "batch_size": 32, + "tflops": 15.5206 + }, + { + "batch_size": 64, + "tflops": 7.4691 + }, + { + "batch_size": 128, + "tflops": 20.8705 + }, + { + "batch_size": 256, + "tflops": 25.5421 + }, + { + "batch_size": 512, + "tflops": 25.4748 + }, + { + "batch_size": 1024, + "tflops": 24.6075 + }, + { + "batch_size": 2048, + "tflops": 22.0469 + }, + { + "batch_size": 4096, + "tflops": 19.8537 + } + ] + } + ] + }, + { + "in_features": 2560, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B o_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.0679 + }, + { + "batch_size": 2, + "tflops": 1.9995 + }, + { + "batch_size": 4, + "tflops": 3.161 + }, + { + "batch_size": 8, + "tflops": 4.3066 + }, + { + "batch_size": 16, + "tflops": 8.146 + }, + { + "batch_size": 32, + "tflops": 15.5181 + }, + { + "batch_size": 64, + "tflops": 17.0524 + }, + { + "batch_size": 128, + "tflops": 20.2958 + }, + { + "batch_size": 256, + "tflops": 22.0882 + }, + { + "batch_size": 512, + "tflops": 23.9008 + }, + { + "batch_size": 1024, + "tflops": 24.4399 + }, + { + "batch_size": 2048, + "tflops": 24.7905 + }, + { + "batch_size": 4096, + "tflops": 24.621 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9698 + }, + { + "batch_size": 2, + "tflops": 1.8477 + }, + { + "batch_size": 4, + "tflops": 3.0152 + }, + { + "batch_size": 8, + "tflops": 4.3119 + }, + { + "batch_size": 16, + 
"tflops": 8.198 + }, + { + "batch_size": 32, + "tflops": 15.6252 + }, + { + "batch_size": 64, + "tflops": 16.0473 + }, + { + "batch_size": 128, + "tflops": 19.2885 + }, + { + "batch_size": 256, + "tflops": 20.8525 + }, + { + "batch_size": 512, + "tflops": 23.1322 + }, + { + "batch_size": 1024, + "tflops": 24.1483 + }, + { + "batch_size": 2048, + "tflops": 24.2402 + }, + { + "batch_size": 4096, + "tflops": 24.7659 + } + ] + } + ] + }, + { + "in_features": 2560, + "out_features": 3840, + "group_size": 128, + "comment": "Qwen3-4B qkv_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9975 + }, + { + "batch_size": 2, + "tflops": 1.8936 + }, + { + "batch_size": 4, + "tflops": 3.8242 + }, + { + "batch_size": 8, + "tflops": 4.4486 + }, + { + "batch_size": 16, + "tflops": 8.6678 + }, + { + "batch_size": 32, + "tflops": 16.6232 + }, + { + "batch_size": 64, + "tflops": 17.7952 + }, + { + "batch_size": 128, + "tflops": 23.7005 + }, + { + "batch_size": 256, + "tflops": 24.3613 + }, + { + "batch_size": 512, + "tflops": 21.7255 + }, + { + "batch_size": 1024, + "tflops": 25.0983 + }, + { + "batch_size": 2048, + "tflops": 25.7907 + }, + { + "batch_size": 4096, + "tflops": 26.2186 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9196 + }, + { + "batch_size": 2, + "tflops": 1.7588 + }, + { + "batch_size": 4, + "tflops": 3.4896 + }, + { + "batch_size": 8, + "tflops": 4.38 + }, + { + "batch_size": 16, + "tflops": 8.4495 + }, + { + "batch_size": 32, + "tflops": 16.3339 + }, + { + "batch_size": 64, + "tflops": 17.2289 + }, + { + "batch_size": 128, + "tflops": 23.5615 + }, + { + "batch_size": 256, + "tflops": 23.8283 + }, + { + "batch_size": 512, + "tflops": 21.3179 + }, + { + "batch_size": 1024, + "tflops": 24.2931 + }, + { + "batch_size": 2048, + "tflops": 25.3921 + }, + { + "batch_size": 4096, + "tflops": 25.9323 + } + ] + } + ] + }, + { + "in_features": 2560, + 
"out_features": 19456, + "group_size": 128, + "comment": "Qwen3-4B gate_up_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.894 + }, + { + "batch_size": 2, + "tflops": 1.7246 + }, + { + "batch_size": 4, + "tflops": 3.3242 + }, + { + "batch_size": 8, + "tflops": 4.5236 + }, + { + "batch_size": 16, + "tflops": 8.645 + }, + { + "batch_size": 32, + "tflops": 16.7812 + }, + { + "batch_size": 64, + "tflops": 21.7332 + }, + { + "batch_size": 128, + "tflops": 24.898 + }, + { + "batch_size": 256, + "tflops": 25.4871 + }, + { + "batch_size": 512, + "tflops": 25.5385 + }, + { + "batch_size": 1024, + "tflops": 25.4898 + }, + { + "batch_size": 2048, + "tflops": 25.2532 + }, + { + "batch_size": 4096, + "tflops": 24.7898 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.8544 + }, + { + "batch_size": 2, + "tflops": 1.6239 + }, + { + "batch_size": 4, + "tflops": 3.1761 + }, + { + "batch_size": 8, + "tflops": 4.4362 + }, + { + "batch_size": 16, + "tflops": 8.4064 + }, + { + "batch_size": 32, + "tflops": 16.3875 + }, + { + "batch_size": 64, + "tflops": 15.2236 + }, + { + "batch_size": 128, + "tflops": 24.6912 + }, + { + "batch_size": 256, + "tflops": 25.3165 + }, + { + "batch_size": 512, + "tflops": 25.2057 + }, + { + "batch_size": 1024, + "tflops": 24.8581 + }, + { + "batch_size": 2048, + "tflops": 24.9453 + }, + { + "batch_size": 4096, + "tflops": 24.8502 + } + ] + } + ] + }, + { + "in_features": 3584, + "out_features": 3584, + "group_size": 128, + "comment": "Qwen2.5-7B o_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9845 + }, + { + "batch_size": 2, + "tflops": 1.8544 + }, + { + "batch_size": 4, + "tflops": 3.7296 + }, + { + "batch_size": 8, + "tflops": 4.2831 + }, + { + "batch_size": 16, + "tflops": 8.3524 + }, + { + "batch_size": 32, + "tflops": 16.109 + }, + { + "batch_size": 64, + "tflops": 
17.7894 + }, + { + "batch_size": 128, + "tflops": 18.3642 + }, + { + "batch_size": 256, + "tflops": 23.222 + }, + { + "batch_size": 512, + "tflops": 21.6664 + }, + { + "batch_size": 1024, + "tflops": 24.1986 + }, + { + "batch_size": 2048, + "tflops": 25.5559 + }, + { + "batch_size": 4096, + "tflops": 25.6219 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9254 + }, + { + "batch_size": 2, + "tflops": 1.7193 + }, + { + "batch_size": 4, + "tflops": 3.3911 + }, + { + "batch_size": 8, + "tflops": 4.1332 + }, + { + "batch_size": 16, + "tflops": 7.9547 + }, + { + "batch_size": 32, + "tflops": 15.5994 + }, + { + "batch_size": 64, + "tflops": 17.3155 + }, + { + "batch_size": 128, + "tflops": 17.7375 + }, + { + "batch_size": 256, + "tflops": 21.7902 + }, + { + "batch_size": 512, + "tflops": 21.1015 + }, + { + "batch_size": 1024, + "tflops": 23.198 + }, + { + "batch_size": 2048, + "tflops": 24.9341 + }, + { + "batch_size": 4096, + "tflops": 25.4049 + } + ] + } + ] + }, + { + "in_features": 3584, + "out_features": 4608, + "group_size": 128, + "comment": "Qwen2.5-7B qkv_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9589 + }, + { + "batch_size": 2, + "tflops": 1.8141 + }, + { + "batch_size": 4, + "tflops": 3.6611 + }, + { + "batch_size": 8, + "tflops": 4.2792 + }, + { + "batch_size": 16, + "tflops": 8.2917 + }, + { + "batch_size": 32, + "tflops": 16.0026 + }, + { + "batch_size": 64, + "tflops": 18.9172 + }, + { + "batch_size": 128, + "tflops": 23.4304 + }, + { + "batch_size": 256, + "tflops": 22.9194 + }, + { + "batch_size": 512, + "tflops": 22.9634 + }, + { + "batch_size": 1024, + "tflops": 23.9841 + }, + { + "batch_size": 2048, + "tflops": 25.1618 + }, + { + "batch_size": 4096, + "tflops": 24.8986 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9077 + }, + { + "batch_size": 2, + "tflops": 1.6596 + }, + 
{ + "batch_size": 4, + "tflops": 3.3327 + }, + { + "batch_size": 8, + "tflops": 4.1092 + }, + { + "batch_size": 16, + "tflops": 7.2905 + }, + { + "batch_size": 32, + "tflops": 14.346 + }, + { + "batch_size": 64, + "tflops": 17.8276 + }, + { + "batch_size": 128, + "tflops": 22.2431 + }, + { + "batch_size": 256, + "tflops": 21.614 + }, + { + "batch_size": 512, + "tflops": 22.1398 + }, + { + "batch_size": 1024, + "tflops": 22.8507 + }, + { + "batch_size": 2048, + "tflops": 24.3894 + }, + { + "batch_size": 4096, + "tflops": 24.473 + } + ] + } + ] + }, + { + "in_features": 3584, + "out_features": 37888, + "group_size": 128, + "comment": "Qwen2.5-7B gate_up_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9494 + }, + { + "batch_size": 2, + "tflops": 1.7944 + }, + { + "batch_size": 4, + "tflops": 3.5244 + }, + { + "batch_size": 8, + "tflops": 4.5967 + }, + { + "batch_size": 16, + "tflops": 8.8868 + }, + { + "batch_size": 32, + "tflops": 17.446 + }, + { + "batch_size": 64, + "tflops": 21.4765 + }, + { + "batch_size": 128, + "tflops": 24.5377 + }, + { + "batch_size": 256, + "tflops": 25.5151 + }, + { + "batch_size": 512, + "tflops": 25.6997 + }, + { + "batch_size": 1024, + "tflops": 25.4979 + }, + { + "batch_size": 2048, + "tflops": 25.4565 + }, + { + "batch_size": 4096, + "tflops": 25.6372 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9444 + }, + { + "batch_size": 2, + "tflops": 1.7198 + }, + { + "batch_size": 4, + "tflops": 3.3862 + }, + { + "batch_size": 8, + "tflops": 4.445 + }, + { + "batch_size": 16, + "tflops": 8.5539 + }, + { + "batch_size": 32, + "tflops": 16.8321 + }, + { + "batch_size": 64, + "tflops": 14.6204 + }, + { + "batch_size": 128, + "tflops": 24.8905 + }, + { + "batch_size": 256, + "tflops": 25.3239 + }, + { + "batch_size": 512, + "tflops": 25.0395 + }, + { + "batch_size": 1024, + "tflops": 24.8458 + }, + { + "batch_size": 2048, + 
"tflops": 24.881 + }, + { + "batch_size": 4096, + "tflops": 25.3841 + } + ] + } + ] + }, + { + "in_features": 8192, + "out_features": 512, + "group_size": 128, + "comment": "L2 2MiB at", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.1894 + }, + { + "batch_size": 2, + "tflops": 1.7129 + }, + { + "batch_size": 4, + "tflops": 2.5526 + }, + { + "batch_size": 8, + "tflops": 1.6675 + }, + { + "batch_size": 16, + "tflops": 3.1527 + }, + { + "batch_size": 32, + "tflops": 5.4616 + }, + { + "batch_size": 64, + "tflops": 3.5551 + }, + { + "batch_size": 128, + "tflops": 9.7177 + }, + { + "batch_size": 256, + "tflops": 16.9947 + }, + { + "batch_size": 512, + "tflops": 19.7511 + }, + { + "batch_size": 1024, + "tflops": 23.1965 + }, + { + "batch_size": 2048, + "tflops": 17.2034 + }, + { + "batch_size": 4096, + "tflops": 14.4072 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9803 + }, + { + "batch_size": 2, + "tflops": 1.4679 + }, + { + "batch_size": 4, + "tflops": 2.4295 + }, + { + "batch_size": 8, + "tflops": 1.5168 + }, + { + "batch_size": 16, + "tflops": 2.9111 + }, + { + "batch_size": 32, + "tflops": 5.0758 + }, + { + "batch_size": 64, + "tflops": 3.3141 + }, + { + "batch_size": 128, + "tflops": 9.3917 + }, + { + "batch_size": 256, + "tflops": 16.8781 + }, + { + "batch_size": 512, + "tflops": 19.6022 + }, + { + "batch_size": 1024, + "tflops": 23.0311 + }, + { + "batch_size": 2048, + "tflops": 16.2197 + }, + { + "batch_size": 4096, + "tflops": 12.3815 + } + ] + } + ] + }, + { + "in_features": 8320, + "out_features": 512, + "group_size": 128, + "comment": "L2 2MiB above", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.1795 + }, + { + "batch_size": 2, + "tflops": 1.794 + }, + { + "batch_size": 4, + "tflops": 0.8467 + }, + { + "batch_size": 8, + "tflops": 1.6837 + }, + { + "batch_size": 16, + "tflops": 3.3689 
+ }, + { + "batch_size": 32, + "tflops": 6.6263 + }, + { + "batch_size": 64, + "tflops": 4.5296 + }, + { + "batch_size": 128, + "tflops": 10.7239 + }, + { + "batch_size": 256, + "tflops": 17.3085 + }, + { + "batch_size": 512, + "tflops": 19.827 + }, + { + "batch_size": 1024, + "tflops": 24.4261 + }, + { + "batch_size": 2048, + "tflops": 20.3098 + }, + { + "batch_size": 4096, + "tflops": 20.7601 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.0345 + }, + { + "batch_size": 2, + "tflops": 1.6052 + }, + { + "batch_size": 4, + "tflops": 0.7527 + }, + { + "batch_size": 8, + "tflops": 1.4949 + }, + { + "batch_size": 16, + "tflops": 2.9586 + }, + { + "batch_size": 32, + "tflops": 5.8479 + }, + { + "batch_size": 64, + "tflops": 4.2138 + }, + { + "batch_size": 128, + "tflops": 10.9798 + }, + { + "batch_size": 256, + "tflops": 16.9799 + }, + { + "batch_size": 512, + "tflops": 19.4053 + }, + { + "batch_size": 1024, + "tflops": 23.9872 + }, + { + "batch_size": 2048, + "tflops": 20.3439 + }, + { + "batch_size": 4096, + "tflops": 21.2659 + } + ] + } + ] + }, + { + "in_features": 9728, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B down_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9417 + }, + { + "batch_size": 2, + "tflops": 1.8368 + }, + { + "batch_size": 4, + "tflops": 2.2093 + }, + { + "batch_size": 8, + "tflops": 4.2852 + }, + { + "batch_size": 16, + "tflops": 8.5736 + }, + { + "batch_size": 32, + "tflops": 16.6779 + }, + { + "batch_size": 64, + "tflops": 16.5347 + }, + { + "batch_size": 128, + "tflops": 24.0537 + }, + { + "batch_size": 256, + "tflops": 25.0934 + }, + { + "batch_size": 512, + "tflops": 25.5208 + }, + { + "batch_size": 1024, + "tflops": 24.8518 + }, + { + "batch_size": 2048, + "tflops": 24.6247 + }, + { + "batch_size": 4096, + "tflops": 24.7359 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + 
"batch_size": 1, + "tflops": 0.9087 + }, + { + "batch_size": 2, + "tflops": 1.7045 + }, + { + "batch_size": 4, + "tflops": 2.2514 + }, + { + "batch_size": 8, + "tflops": 4.3491 + }, + { + "batch_size": 16, + "tflops": 8.6603 + }, + { + "batch_size": 32, + "tflops": 16.8873 + }, + { + "batch_size": 64, + "tflops": 15.7646 + }, + { + "batch_size": 128, + "tflops": 23.9267 + }, + { + "batch_size": 256, + "tflops": 24.96 + }, + { + "batch_size": 512, + "tflops": 25.5181 + }, + { + "batch_size": 1024, + "tflops": 24.9927 + }, + { + "batch_size": 2048, + "tflops": 23.9842 + }, + { + "batch_size": 4096, + "tflops": 23.4345 + } + ] + } + ] + }, + { + "in_features": 16384, + "out_features": 2048, + "group_size": 128, + "comment": "gemma-2b down_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.6622 + }, + { + "batch_size": 2, + "tflops": 1.2574 + }, + { + "batch_size": 4, + "tflops": 1.7651 + }, + { + "batch_size": 8, + "tflops": 3.4955 + }, + { + "batch_size": 16, + "tflops": 6.9043 + }, + { + "batch_size": 32, + "tflops": 13.3775 + }, + { + "batch_size": 64, + "tflops": 12.3375 + }, + { + "batch_size": 128, + "tflops": 19.4665 + }, + { + "batch_size": 256, + "tflops": 22.2801 + }, + { + "batch_size": 512, + "tflops": 20.5712 + }, + { + "batch_size": 1024, + "tflops": 22.1274 + }, + { + "batch_size": 2048, + "tflops": 14.5177 + }, + { + "batch_size": 4096, + "tflops": 15.7439 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.619 + }, + { + "batch_size": 2, + "tflops": 1.1881 + }, + { + "batch_size": 4, + "tflops": 1.6908 + }, + { + "batch_size": 8, + "tflops": 3.3439 + }, + { + "batch_size": 16, + "tflops": 6.5966 + }, + { + "batch_size": 32, + "tflops": 12.7721 + }, + { + "batch_size": 64, + "tflops": 10.3608 + }, + { + "batch_size": 128, + "tflops": 17.3985 + }, + { + "batch_size": 256, + "tflops": 21.4558 + }, + { + "batch_size": 512, + "tflops": 19.6548 + 
}, + { + "batch_size": 1024, + "tflops": 21.6144 + }, + { + "batch_size": 2048, + "tflops": 13.1811 + }, + { + "batch_size": 4096, + "tflops": 14.2666 + } + ] + } + ] + }, + { + "in_features": 18944, + "out_features": 3584, + "group_size": 128, + "comment": "Qwen2.5-7B down_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.8843 + }, + { + "batch_size": 2, + "tflops": 1.1249 + }, + { + "batch_size": 4, + "tflops": 2.2302 + }, + { + "batch_size": 8, + "tflops": 4.4101 + }, + { + "batch_size": 16, + "tflops": 8.4345 + }, + { + "batch_size": 32, + "tflops": 16.6272 + }, + { + "batch_size": 64, + "tflops": 18.1639 + }, + { + "batch_size": 128, + "tflops": 17.9899 + }, + { + "batch_size": 256, + "tflops": 22.8929 + }, + { + "batch_size": 512, + "tflops": 24.0236 + }, + { + "batch_size": 1024, + "tflops": 23.892 + }, + { + "batch_size": 2048, + "tflops": 23.0834 + }, + { + "batch_size": 4096, + "tflops": 23.6381 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.8621 + }, + { + "batch_size": 2, + "tflops": 1.0963 + }, + { + "batch_size": 4, + "tflops": 2.1254 + }, + { + "batch_size": 8, + "tflops": 4.1472 + }, + { + "batch_size": 16, + "tflops": 8.2929 + }, + { + "batch_size": 32, + "tflops": 16.0558 + }, + { + "batch_size": 64, + "tflops": 16.2587 + }, + { + "batch_size": 128, + "tflops": 17.5202 + }, + { + "batch_size": 256, + "tflops": 22.6163 + }, + { + "batch_size": 512, + "tflops": 24.0904 + }, + { + "batch_size": 1024, + "tflops": 24.3297 + }, + { + "batch_size": 2048, + "tflops": 22.3802 + }, + { + "batch_size": 4096, + "tflops": 22.4165 + } + ] + } + ] + }, + { + "in_features": 38912, + "out_features": 2048, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 != 0", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.4265 + }, + { + "batch_size": 2, + "tflops": 0.8436 + }, + { + 
"batch_size": 4, + "tflops": 1.6834 + }, + { + "batch_size": 8, + "tflops": 3.1377 + }, + { + "batch_size": 16, + "tflops": 5.5249 + }, + { + "batch_size": 32, + "tflops": 9.3102 + }, + { + "batch_size": 64, + "tflops": 6.8117 + }, + { + "batch_size": 128, + "tflops": 18.2063 + }, + { + "batch_size": 256, + "tflops": 23.2781 + }, + { + "batch_size": 512, + "tflops": 21.9953 + }, + { + "batch_size": 1024, + "tflops": 20.45 + }, + { + "batch_size": 2048, + "tflops": 15.5919 + }, + { + "batch_size": 4096, + "tflops": 16.4851 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.3541 + }, + { + "batch_size": 2, + "tflops": 0.6942 + }, + { + "batch_size": 4, + "tflops": 1.3525 + }, + { + "batch_size": 8, + "tflops": 2.558 + }, + { + "batch_size": 16, + "tflops": 4.5556 + }, + { + "batch_size": 32, + "tflops": 7.6537 + }, + { + "batch_size": 64, + "tflops": 6.5258 + }, + { + "batch_size": 128, + "tflops": 17.52 + }, + { + "batch_size": 256, + "tflops": 22.7488 + }, + { + "batch_size": 512, + "tflops": 21.7849 + }, + { + "batch_size": 1024, + "tflops": 19.1522 + }, + { + "batch_size": 2048, + "tflops": 13.7515 + }, + { + "batch_size": 4096, + "tflops": 14.5751 + } + ] + } + ] + }, + { + "in_features": 49152, + "out_features": 2048, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 == 0", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.2223 + }, + { + "batch_size": 2, + "tflops": 0.4405 + }, + { + "batch_size": 4, + "tflops": 0.8804 + }, + { + "batch_size": 8, + "tflops": 1.7199 + }, + { + "batch_size": 16, + "tflops": 3.3486 + }, + { + "batch_size": 32, + "tflops": 6.1204 + }, + { + "batch_size": 64, + "tflops": 5.0983 + }, + { + "batch_size": 128, + "tflops": 13.5035 + }, + { + "batch_size": 256, + "tflops": 17.7353 + }, + { + "batch_size": 512, + "tflops": 17.5016 + }, + { + "batch_size": 1024, + "tflops": 19.0624 + }, + { + "batch_size": 2048, + 
"tflops": 13.322 + }, + { + "batch_size": 4096, + "tflops": 14.6727 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.1883 + }, + { + "batch_size": 2, + "tflops": 0.3725 + }, + { + "batch_size": 4, + "tflops": 0.7315 + }, + { + "batch_size": 8, + "tflops": 1.3876 + }, + { + "batch_size": 16, + "tflops": 2.6876 + }, + { + "batch_size": 32, + "tflops": 4.7003 + }, + { + "batch_size": 64, + "tflops": 4.3578 + }, + { + "batch_size": 128, + "tflops": 12.7461 + }, + { + "batch_size": 256, + "tflops": 16.4239 + }, + { + "batch_size": 512, + "tflops": 16.4054 + }, + { + "batch_size": 1024, + "tflops": 17.5356 + }, + { + "batch_size": 2048, + "tflops": 12.2891 + }, + { + "batch_size": 4096, + "tflops": 13.2295 + } + ] + } + ] + }, + { + "in_features": 49152, + "out_features": 4096, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 == 0", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.2614 + }, + { + "batch_size": 2, + "tflops": 0.5111 + }, + { + "batch_size": 4, + "tflops": 0.9764 + }, + { + "batch_size": 8, + "tflops": 2.0766 + }, + { + "batch_size": 16, + "tflops": 3.9968 + }, + { + "batch_size": 32, + "tflops": 7.3814 + }, + { + "batch_size": 64, + "tflops": 6.0318 + }, + { + "batch_size": 128, + "tflops": 9.0754 + }, + { + "batch_size": 256, + "tflops": 11.8871 + }, + { + "batch_size": 512, + "tflops": 17.5524 + }, + { + "batch_size": 1024, + "tflops": 20.4928 + }, + { + "batch_size": 2048, + "tflops": 14.634 + }, + { + "batch_size": 4096, + "tflops": 15.3292 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.1923 + }, + { + "batch_size": 2, + "tflops": 0.3716 + }, + { + "batch_size": 4, + "tflops": 0.729 + }, + { + "batch_size": 8, + "tflops": 1.5244 + }, + { + "batch_size": 16, + "tflops": 3.0343 + }, + { + "batch_size": 32, + "tflops": 5.8279 + }, + { + "batch_size": 64, + "tflops": 5.0934 + 
}, + { + "batch_size": 128, + "tflops": 9.985 + }, + { + "batch_size": 256, + "tflops": 11.8763 + }, + { + "batch_size": 512, + "tflops": 15.1345 + }, + { + "batch_size": 1024, + "tflops": 18.5422 + }, + { + "batch_size": 2048, + "tflops": 13.0673 + }, + { + "batch_size": 4096, + "tflops": 13.0184 + } + ] + } + ] + } + ] +} diff --git a/tests/kernels/quantization/test_hybrid_w4a16_perf.py b/tests/kernels/quantization/test_hybrid_w4a16_perf.py new file mode 100644 index 000000000000..4e470b8ebafa --- /dev/null +++ b/tests/kernels/quantization/test_hybrid_w4a16_perf.py @@ -0,0 +1,695 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Performance regression tests for the hybrid W4A16 GEMM kernel. + +Compares measured TFLOP/s against golden baselines stored in per-GPU JSON +files under ``golden/``. A two-sided tolerance band catches both regressions +and unexpected improvements. + +Usage:: + + # rep=20 for testing, rep=50 for --measure-baselines. + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \\ + -v -s + + # Measure new baselines (writes to measured/, never overwrites golden/): + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \\ + --measure-baselines -s + + # Include noisy/intermittent cases: + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \\ + --intermittent -v -s + +Golden JSON schema is documented in ``golden/README.md``. 
+""" + +from __future__ import annotations + +import json +import math +import pathlib +import time +from typing import Any + +import pytest +import torch + +from vllm.platforms import current_platform + +# --------------------------------------------------------------------------- +# GPU temperature reading +# --------------------------------------------------------------------------- + + +def _read_gpu_temp() -> float: + """Return GPU edge temperature in degrees C, or NaN if unavailable.""" + try: + import amdsmi + + amdsmi.amdsmi_init() + devices = amdsmi.amdsmi_get_processor_handles() + if not devices: + return float("nan") + return float( + amdsmi.amdsmi_get_temp_metric( + devices[0], + amdsmi.AmdSmiTemperatureType.EDGE, + amdsmi.AmdSmiTemperatureMetric.CURRENT, + ) + ) + except Exception: + return float("nan") + + +def _log_temp(config: Any, label: str) -> float: + """Read and log temperature with a label. Returns temp in C.""" + from tests.kernels.quantization.conftest import get_temp_log + + t = _read_gpu_temp() + get_temp_log(config).append((time.monotonic(), label, t)) + return t + + +# --------------------------------------------------------------------------- +# Constants -- single source of truth +# --------------------------------------------------------------------------- + +SHAPES: list[dict[str, Any]] = [ + # google/gemma-2b-AWQ + { + "in_features": 2048, + "out_features": 32768, + "group_size": 128, + "comment": "gemma-2b gate_up_proj", + }, + { + "in_features": 16384, + "out_features": 2048, + "group_size": 128, + "comment": "gemma-2b down_proj", + }, + { + "in_features": 2048, + "out_features": 2560, + "group_size": 128, + "comment": "gemma-2b qkv_proj", + }, + { + "in_features": 2048, + "out_features": 2048, + "group_size": 128, + "comment": "gemma-2b o_proj", + }, + # hf-kernel: shapes with K > 32768 that dispatch to wvSplitK_int4_hf_ + # instead of wvSplitK_int4_hf_sml_ at batch=1. 
+ { + "in_features": 38912, + "out_features": 2048, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 != 0", + }, + { + "in_features": 49152, + "out_features": 2048, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 == 0", + }, + { + "in_features": 49152, + "out_features": 4096, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 == 0", + }, + # Qwen/Qwen3-4B + { + "in_features": 2560, + "out_features": 3840, + "group_size": 128, + "comment": "Qwen3-4B qkv_proj", + }, + { + "in_features": 2560, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B o_proj", + }, + { + "in_features": 2560, + "out_features": 19456, + "group_size": 128, + "comment": "Qwen3-4B gate_up_proj", + }, + { + "in_features": 9728, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B down_proj", + }, + # Qwen/Qwen2.5-7B-Instruct + { + "in_features": 3584, + "out_features": 4608, + "group_size": 128, + "comment": "Qwen2.5-7B qkv_proj", + }, + { + "in_features": 3584, + "out_features": 3584, + "group_size": 128, + "comment": "Qwen2.5-7B o_proj", + }, + { + "in_features": 3584, + "out_features": 37888, + "group_size": 128, + "comment": "Qwen2.5-7B gate_up_proj", + }, + { + "in_features": 18944, + "out_features": 3584, + "group_size": 128, + "comment": "Qwen2.5-7B down_proj", + }, + # W4-L2-cache-boundary + { + "in_features": 8192, + "out_features": 512, + "group_size": 128, + "comment": "L2 2MiB at", + }, + { + "in_features": 8320, + "out_features": 512, + "group_size": 128, + "comment": "L2 2MiB above", + }, +] + +PROVIDERS = ["hybrid-w4a16", "hybrid-w4a16-zp"] +BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096] +TFLOPS_TOLERANCE_PCT = [-10, 10] # [low, high] allowed deviation from golden + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- + +_HERE = pathlib.Path(__file__).resolve().parent 
_GOLDEN_DIR = _HERE / "golden"

# ---------------------------------------------------------------------------
# Shape key helper
# ---------------------------------------------------------------------------

ShapeKey = tuple[int, int, int]  # (in_features, out_features, group_size)


def _shape_key(s: dict[str, Any]) -> ShapeKey:
    """Canonical identity of a shape dict: (in_features, out_features, group_size)."""
    return (s["in_features"], s["out_features"], s["group_size"])


assert len({_shape_key(s) for s in SHAPES}) == len(SHAPES), "duplicate in SHAPES"
_SHAPE_KEY_SET: set[ShapeKey] = {_shape_key(s) for s in SHAPES}
_BATCH_SIZE_SET: set[int] = set(BATCH_SIZES)

# ---------------------------------------------------------------------------
# GPU detection
# ---------------------------------------------------------------------------


def _get_gcn_arch() -> str:
    """Return the GCN architecture string, or '' on non-ROCm."""
    if not current_platform.is_rocm():
        return ""
    try:
        # Private symbol; a vllm build without it yields '' rather than crashing.
        from vllm.platforms.rocm import _GCN_ARCH

        return _GCN_ARCH
    except ImportError:
        return ""


# ---------------------------------------------------------------------------
# Golden-data loading
# ---------------------------------------------------------------------------


def _load_golden(gcn_arch: str) -> tuple[str | None, dict[str, Any] | None]:
    """Find and load the golden JSON for *gcn_arch*.

    Scans ``golden/hybrid_w4a16_*.json`` in sorted (deterministic) order and
    returns the first file whose ``"gpu"`` field is a prefix of *gcn_arch*.

    Returns ``(filename, data)`` or ``(None, None)`` when no match.
    """
    for path in sorted(_GOLDEN_DIR.glob("hybrid_w4a16_*.json")):
        data = json.loads(path.read_text())
        gpu_prefix = data.get("gpu", "")
        # A missing/empty "gpu" field would make startswith() match every
        # architecture and silently load wrong baselines; treat such files
        # as non-matching instead.
        if gpu_prefix and gcn_arch.startswith(gpu_prefix):
            return path.name, data
    return None, None


def _validate_golden(data: dict[str, Any]) -> None:
    """Raise ValueError on rogue shapes or batch sizes in the golden file.

    Shapes annotated with "skip" are exempt from the shape check so retired
    entries can be kept with an explanation.
    """
    for shape in data.get("shapes", []):
        sk = _shape_key(shape)
        if "skip" not in shape and sk not in _SHAPE_KEY_SET:
            raise ValueError(
                f"Golden file contains shape {sk} not in SHAPES. "
                "Remove it or add it to SHAPES in the test file."
            )
        for prov in shape.get("providers", []):
            for bl in prov.get("baselines", []):
                bs = bl.get("batch_size")
                if bs is not None and bs not in _BATCH_SIZE_SET:
                    raise ValueError(
                        f"Golden file contains batch_size={bs} "
                        f"(shape {sk}) not in BATCH_SIZES."
                    )


# ---------------------------------------------------------------------------
# Weight preparation (ported from benchmark script)
# ---------------------------------------------------------------------------


def prepare_hybrid_weights(
    K: int, N: int, group_size: int, device: str = "cuda"
) -> dict[str, torch.Tensor]:
    """Create random packed weights for benchmarking.

    Returns a dict with the int4-packed weight (as int8 bytes and as the
    original int32 view), per-group fp16 scales, and fp16 zero points.
    Values are random; only the layout/dtypes matter for timing.
    """
    num_groups = K // group_size

    w_q_skinny_i32 = torch.randint(
        0, 2**31, (N, K // 8), dtype=torch.int32, device=device
    )
    w_q_skinny = w_q_skinny_i32.view(torch.int8).contiguous()
    w_s_skinny = torch.randn(N, num_groups, dtype=torch.float16, device=device) * 0.01
    w_zp = torch.randint(0, 16, (N, num_groups), dtype=torch.int32, device=device).to(
        torch.float16
    )

    return {
        "w_q_skinny": w_q_skinny,
        "w_s_skinny": w_s_skinny,
        "w_q_skinny_i32": w_q_skinny_i32,
        "w_zp": w_zp,
    }


# ---------------------------------------------------------------------------
# Core measurement
# ---------------------------------------------------------------------------


COOL_DOWN_DELAY_S = 2


def measure_tflops(
    M: int,
    weights: dict[str, torch.Tensor],
    K: int,
    N: int,
    group_size: int,
    provider: str,
) -> float:
    """Run the kernel and return median TFLOP/s.

    *provider* selects whether zero points are passed ("hybrid-w4a16-zp")
    or not ("hybrid-w4a16"). Timing uses triton's CUDA-graph benchmark
    helper with the 0.5 quantile (median).
    """
    # Imported lazily so collecting this module does not require a ROCm build.
    from vllm.model_executor.kernels.linear.mixed_precision.hybrid_w4a16 import (
        _hybrid_w4a16_apply_impl,
    )
    from vllm.triton_utils import triton
    from vllm.utils.platform_utils import num_compute_units

    device = "cuda"
    dtype = torch.float16
    a = torch.randn((M, K), device=device, dtype=dtype)

    cu_count = num_compute_units()
    use_zp = provider == "hybrid-w4a16-zp"

    def run():
        return _hybrid_w4a16_apply_impl(
            a,
            weights["w_q_skinny"],
            weights["w_s_skinny"],
            weights["w_q_skinny_i32"],
            weights["w_zp"] if use_zp else None,
            None,  # bias
            cu_count,
            group_size,
        )

    ms = triton.testing.do_bench_cudagraph(run, quantiles=[0.5])
    # 2*M*N*K FLOPs per GEMM; ms -> s, FLOP/s -> TFLOP/s.
    tflops = (2 * M * N * K) * 1e-12 / (ms * 1e-3)
    return tflops


# ---------------------------------------------------------------------------
# Parametrize helpers
# ---------------------------------------------------------------------------


def _make_params() -> list[pytest.param]:
    """Build the SHAPES x PROVIDERS parameter list (sorted for stable IDs)."""
    params = []
    for shape in sorted(SHAPES, key=_shape_key):
        for prov in PROVIDERS:
            k = shape["in_features"]
            n = shape["out_features"]
            g = shape["group_size"]
            test_id = f"i{k}-o{n}-g{g}-{prov}"
            params.append(pytest.param(shape, prov, id=test_id))
    return params


# ---------------------------------------------------------------------------
# Lookup helpers
# ---------------------------------------------------------------------------


def _find_shape_in_golden(
    golden: dict[str, Any], key: ShapeKey
) -> dict[str, Any] | None:
    """Return the golden shape entry matching *key*, or None."""
    for s in golden.get("shapes", []):
        if _shape_key(s) == key:
            return s
    return None


def _find_provider_in_shape(
    shape_data: dict[str, Any], provider: str
) -> dict[str, Any] | None:
    """Return the provider entry named *provider* inside a shape, or None."""
    for p in shape_data.get("providers", []):
        if p.get("provider") == provider:
            return p
    return None


def _find_baseline(
    provider_data: dict[str, Any], batch_size: int
) -> dict[str, Any] | None:
    """Return the baseline entry for *batch_size*, or None."""
    for bl in provider_data.get("baselines", []):
        if bl.get("batch_size") == batch_size:
            return bl
    return None


# ---------------------------------------------------------------------------
# Measured-results collector for --measure-baselines
# ---------------------------------------------------------------------------


def _record_measurement(
    config: Any,
    gpu: str,
    shape: dict[str, Any],
    provider: str,
    batch_size: int,
    tflops: float,
    annotations: dict[str, Any] | None = None,
) -> None:
    """Append a measurement into the session-scoped collector.

    Entries are kept sorted (shapes by key, baselines by batch size) so the
    emitted JSON diffs cleanly against the golden files.
    """
    from tests.kernels.quantization.conftest import get_measured_results

    results = get_measured_results(config)
    shapes_list = results.setdefault(gpu, [])
    sk = _shape_key(shape)

    # Find or create shape entry
    shape_entry = None
    for s in shapes_list:
        if _shape_key(s) == sk:
            shape_entry = s
            break
    if shape_entry is None:
        shape_entry = {
            "in_features": sk[0],
            "out_features": sk[1],
            "group_size": sk[2],
            "comment": shape.get("comment", ""),
            "providers": [],
        }
        shapes_list.append(shape_entry)
        # Keep sorted
        shapes_list.sort(key=_shape_key)

    # Find or create provider entry
    prov_entry = _find_provider_in_shape(shape_entry, provider)
    if prov_entry is None:
        prov_entry = {"provider": provider, "baselines": []}
        shape_entry["providers"].append(prov_entry)

    # Build baseline entry
    bl: dict[str, Any] = {"batch_size": batch_size, "tflops": round(tflops, 4)}
    if annotations:
        bl.update(annotations)
    prov_entry["baselines"].append(bl)
    # Keep batch_sizes sorted
    prov_entry["baselines"].sort(key=lambda b: b["batch_size"])


def _record_skip(
    config: Any,
    gpu: str,
    shape: dict[str, Any],
    provider: str,
    batch_size: int,
    reason: str,
) -> None:
    """Record a skipped entry in measured results (no tflops value)."""
    _record_measurement(
        config,
        gpu,
        shape,
        provider,
        batch_size,
        0.0,
        annotations={"skip": reason},
    )
    # Remove the placeholder tflops field since it's not meaningful
    from tests.kernels.quantization.conftest import get_measured_results

    shapes_list = get_measured_results(config)[gpu]
    for s in shapes_list:
        if _shape_key(s) == _shape_key(shape):
            for p in s["providers"]:
                if p["provider"] == provider:
                    for bl in p["baselines"]:
                        if bl["batch_size"] == batch_size and "skip" in bl:
                            bl.pop("tflops", None)


#
# ---------------------------------------------------------------------------
# The test
# ---------------------------------------------------------------------------


@pytest.fixture(scope="session")
def _warm_up_gpu():
    """Run a throwaway measurement pass to bring the GPU to steady-state temp."""
    if current_platform.is_rocm():
        temp = _read_gpu_temp()
        print(f"GPU temperature: {temp:.0f}\u00b0C")
        if temp < 60.0:
            print("Warming up GPU...")
            shape = sorted(SHAPES, key=_shape_key)[0]
            K, N, gs = shape["in_features"], shape["out_features"], shape["group_size"]
            weights = prepare_hybrid_weights(K, N, gs)
            for bs in BATCH_SIZES:
                measure_tflops(bs, weights, K, N, gs, PROVIDERS[0])
            del weights
            time.sleep(COOL_DOWN_DELAY_S)
    yield


@pytest.mark.benchmark
@pytest.mark.parametrize("shape,provider", _make_params())
def test_hybrid_w4a16_perf(
    shape: dict[str, Any],
    provider: str,
    request: pytest.FixtureRequest,
    _warm_up_gpu: None,
) -> None:
    """Compare measured TFLOP/s against golden baselines for one shape/provider.

    Iterates all BATCH_SIZES for the given shape+provider, honoring per-entry
    "skip"/"intermittent"/"expected_failure" annotations in the golden file.
    In --measure-baselines mode, records results instead of asserting.
    """
    # ---- gate ----
    gcn_arch = _get_gcn_arch()
    if not current_platform.is_rocm() or not gcn_arch.startswith("gfx1151"):
        pytest.skip("ROCm gfx1151 only")

    measure_mode = request.config.getoption("--measure-baselines", default=False)
    # Measuring baselines implies running the intermittent cases too.
    intermittent_mode = (
        request.config.getoption("--intermittent", default=False) or measure_mode
    )

    # ---- load golden ----
    _, golden = _load_golden(gcn_arch)
    if golden is not None:
        _validate_golden(golden)

    if golden is None and not measure_mode:
        pytest.skip(f"No golden baselines for {gcn_arch}")

    sk = _shape_key(shape)
    K, N, group_size = sk

    # ---- shape-level skip ----
    if golden is not None:
        shape_data = _find_shape_in_golden(golden, sk)
        if shape_data is not None and "skip" in shape_data:
            pytest.skip(shape_data["skip"])
    else:
        shape_data = None

    # ---- provider-level skip ----
    if shape_data is not None:
        prov_data = _find_provider_in_shape(shape_data, provider)
        if prov_data is not None and "skip" in prov_data:
            pytest.skip(prov_data["skip"])
    else:
        prov_data = None

    # If shape not in golden, skip in normal mode
    if golden is not None and shape_data is None and not measure_mode:
        pytest.skip(f"Shape {sk} not yet measured on {gcn_arch}")
    if (
        golden is not None
        and shape_data is not None
        and prov_data is None
        and not measure_mode
    ):
        pytest.skip(f"Provider {provider} not yet measured for shape {sk}")

    # ---- cooldown + temperature log ----
    test_id = f"i{K}-o{N}-g{group_size}-{provider}"
    _log_temp(request.config, f"{test_id}:pre-sleep")
    time.sleep(COOL_DOWN_DELAY_S)
    _log_temp(request.config, f"{test_id}:post-sleep")

    # ---- allocate weights once ----
    weights = prepare_hybrid_weights(K, N, group_size)

    # ---- iterate batch sizes ----
    failures: list[str] = []
    xpass_list: list[str] = []
    exit_first = getattr(request.config.option, "exitfirst", False)

    for bs in BATCH_SIZES:
        # Look up baseline
        bl_entry: dict[str, Any] | None = None
        if prov_data is not None:
            bl_entry = _find_baseline(prov_data, bs)

        # ---- skip annotations ----
        if bl_entry is not None and "skip" in bl_entry:
            reason = bl_entry["skip"]
            print(f" batch_size={bs}: SKIP ({reason})")
            if measure_mode:
                _record_skip(request.config, gcn_arch, shape, provider, bs, reason)
            continue

        # ---- intermittent handling ----
        is_intermittent = bl_entry is not None and bl_entry.get("intermittent", False)
        if is_intermittent and not intermittent_mode:
            print(f" batch_size={bs}: SKIP (intermittent)")
            continue

        # ---- measure ----
        _log_temp(request.config, f"{test_id}:bs{bs}:pre")
        tflops = measure_tflops(bs, weights, K, N, group_size, provider)
        post_temp = _log_temp(request.config, f"{test_id}:bs{bs}:post")
        temp_tag = f" [{post_temp:.0f}\u00b0C]"

        if measure_mode:
            # Carry forward annotations
            annot: dict[str, Any] = {}
            if bl_entry is not None:
                for key in ("expected_failure", "intermittent"):
                    if key in bl_entry:
                        annot[key] = bl_entry[key]
            _record_measurement(
                request.config, gcn_arch, shape, provider, bs, tflops, annot
            )
            print(f" batch_size={bs}: {tflops:.2f} TFLOP/s (measured){temp_tag}")
            continue

        if bl_entry is None:
            print(
                f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                f"(no golden value, skipping assertion){temp_tag}"
            )
            continue

        expected = bl_entry["tflops"]

        # ---- sanity check golden value ----
        assert expected > 0 and math.isfinite(expected), (
            f"Golden tflops={expected} for batch_size={bs} is invalid. "
            "Run --measure-baselines to populate."
        )

        # ---- tolerance band ----
        lo = expected * (1 + TFLOPS_TOLERANCE_PCT[0] / 100)
        hi = expected * (1 + TFLOPS_TOLERANCE_PCT[1] / 100)
        in_band = lo <= tflops <= hi

        has_xfail = "expected_failure" in bl_entry

        if has_xfail:
            if in_band:
                xpass_list.append(
                    f" batch_size={bs}: {tflops:.2f} TFLOP/s is now within "
                    f"band [{lo:.2f}, {hi:.2f}] -- remove expected_failure "
                    f"annotation: {bl_entry['expected_failure']}"
                )
                print(
                    f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                    f"(expected {expected:.2f} "
                    f"+ {TFLOPS_TOLERANCE_PCT}%) XPASS{temp_tag}"
                )
            else:
                print(
                    f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                    f"(expected {expected:.2f} "
                    f"+ {TFLOPS_TOLERANCE_PCT}%) "
                    f"XFAIL: {bl_entry['expected_failure']}{temp_tag}"
                )
            continue

        if in_band:
            print(
                f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                f"(expected {expected:.2f} "
                f"+ {TFLOPS_TOLERANCE_PCT}%) PASS{temp_tag}"
            )
        else:
            direction = "regression" if tflops < lo else "improvement"
            delta_pct = (tflops - expected) / expected * 100
            msg = (
                f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                f"(expected {expected:.2f} "
                f"+ {TFLOPS_TOLERANCE_PCT}%) "
                f"FAIL ({direction}, {delta_pct:+.1f}%){temp_tag}"
            )
            print(msg)
            if exit_first:
                if direction == "regression":
                    pytest.fail(
                        f"Performance regression at batch_size={bs}: "
                        f"{tflops:.2f} < {lo:.2f} TFLOP/s. "
                        "Run --measure-baselines to update."
                    )
                else:
                    pytest.fail(
                        f"Performance improved at batch_size={bs}: "
                        f"{tflops:.2f} > {hi:.2f} TFLOP/s. "
                        "Run --measure-baselines to update baselines."
                    )
            failures.append(msg)

    # ---- report xpass ----
    if xpass_list:
        raise AssertionError(
            "Unexpected passes -- remove expected_failure annotations:\n"
            + "\n".join(xpass_list)
        )

    # ---- report failures ----
    if failures:
        # NOTE: the "+" before the join is required. Without it, Python's
        # implicit string-literal concatenation merges the two message
        # literals and then calls .join(failures) on the *merged message*,
        # using the header as the separator and dropping it as a prefix.
        raise AssertionError(
            f"{len(failures)} batch size(s) out of tolerance band. "
            "Run --measure-baselines to update.\n"
            + "\n".join(failures)
        )