diff --git a/.github/workflows/build-rocm-wheels.yml b/.github/workflows/build-rocm-wheels.yml index edce8bf38213..46012b99614c 100644 --- a/.github/workflows/build-rocm-wheels.yml +++ b/.github/workflows/build-rocm-wheels.yml @@ -15,12 +15,12 @@ on: required: false rocm_arch: description: 'ROCm architecture (e.g., gfx1151, gfx1150;gfx1151)' - default: 'gfx1103;gfx1150;gfx1151' + default: 'gfx1151' # TODO: restore to 'gfx1103;gfx1150;gfx1151' before merging required: false env: PYTORCH_INDEX_URL: ${{ github.event.inputs.pytorch_index || 'https://rocm.nightlies.amd.com/v2-staging/gfx1151' }} - PYTORCH_ROCM_ARCH: ${{ github.event.inputs.rocm_arch || 'gfx1103;gfx1150;gfx1151' }} + PYTORCH_ROCM_ARCH: ${{ github.event.inputs.rocm_arch || 'gfx1151' }} # TODO: restore to 'gfx1103;gfx1150;gfx1151' before merging CI_IMAGE: ghcr.io/rocm/vllm/gfx11-ci:latest jobs: @@ -186,6 +186,22 @@ jobs: else echo "amd-smi not found" fi + echo "=== GPU clocks, power profile, and temperature ===" + for card in /sys/class/drm/card[0-9]/device /sys/class/drm/card[0-9][0-9]/device; do + [ -d "$card" ] || continue + echo "--- ${card} ---" + for f in "$card"/hwmon/hwmon*/temp*_input; do + [ -f "$f" ] || continue + echo "${f#"$card"/}: $(( $(cat "$f") / 1000 )) °C" + done + f=power_dpm_force_performance_level + [ -f "$card/$f" ] && echo "$f: $(cat "$card/$f")" + for f in pp_dpm_mclk pp_dpm_sclk pp_power_profile_mode; do + [ -f "$card/$f" ] || continue + echo "$f:" + head -20 "$card/$f" | sed 's/^/ /' + done + done - name: Install wheel and test dependencies run: | @@ -233,7 +249,8 @@ jobs: tests/kernels/quantization/test_hybrid_w4a16_triton.py \ tests/kernels/quantization/test_rocm_compressed_tensors_w4a16.py \ tests/kernels/quantization/test_rocm_skinny_gemms.py \ - tests/quantization/test_hip_w4a16_kernel.py + tests/quantization/test_hip_w4a16_kernel.py \ + tests/kernels/quantization/test_hybrid_w4a16_perf.py upload-wheel: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 
134bbc5cc893..adcb9f4b191f 100644 --- a/.gitignore +++ b/.gitignore @@ -245,3 +245,6 @@ vllm/grpc/vllm_engine_pb2.pyi # Ignore generated cpu headers csrc/cpu/cpu_attn_dispatch_generated.h + +# Measured performance baselines (never committed) +tests/kernels/quantization/measured/ diff --git a/benchmarks/kernels/benchmark_hybrid_w4a16_gemm.py b/benchmarks/kernels/benchmark_hybrid_w4a16_gemm.py deleted file mode 100644 index 8336c1e7c94d..000000000000 --- a/benchmarks/kernels/benchmark_hybrid_w4a16_gemm.py +++ /dev/null @@ -1,188 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Benchmark the HybridW4A16LinearKernel across decode and prefill shapes. - -Usage: - python benchmark_int4_gemm.py - python benchmark_int4_gemm.py --models Qwen/Qwen3-4B - python benchmark_int4_gemm.py --group-size 128 -""" - -import argparse -import copy -import itertools -import os - -import torch - -from vllm.triton_utils import triton - -# --------------------------------------------------------------------------- -# Weight shapes: [K, N], TP_SPLIT_DIM -# --------------------------------------------------------------------------- -WEIGHT_SHAPES = { - "Qwen/Qwen3-4B": [ - ([2560, 3840], 1), # qkv_proj - ([2560, 2560], 0), # o_proj - ([2560, 19456], 1), # gate_up_proj - ([9728, 2560], 0), # down_proj - ], - "Qwen/Qwen2.5-7B-Instruct": [ - ([3584, 4608], 1), - ([3584, 3584], 0), - ([3584, 37888], 1), - ([18944, 3584], 0), - ], -} - - -# --------------------------------------------------------------------------- -# Weight packing -# --------------------------------------------------------------------------- -def prepare_hybrid_weights(K, N, group_size, device="cuda"): - """Create random weights for benchmarking. - - Returns (w_q_skinny, w_s_skinny, w_fp16, w_q_skinny_i32). 
- """ - num_groups = K // group_size - - # Random packed weights — actual values don't matter for throughput - w_q_skinny_i32 = torch.randint( - 0, 2**31, (N, K // 8), dtype=torch.int32, device=device - ) - w_q_skinny = w_q_skinny_i32.view(torch.int8).contiguous() - w_s_skinny = torch.randn(N, num_groups, dtype=torch.float16, device=device) * 0.01 - - # Raw per-group zero-points for asymmetric benchmarks - w_zp = torch.randint(0, 16, (N, num_groups), dtype=torch.int32, device=device).to( - torch.float16 - ) - - # FP16 baseline for F.linear - w_fp16 = torch.randn(N, K, dtype=torch.float16, device=device) * 0.01 - - return w_q_skinny, w_s_skinny, w_fp16, w_q_skinny_i32, w_zp - - -# --------------------------------------------------------------------------- -# Benchmark -# --------------------------------------------------------------------------- -PROVIDERS = ["torch-fp16", "hybrid-w4a16", "hybrid-w4a16-zp"] - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["batch_size"], - x_vals=[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096], - x_log=False, - line_arg="provider", - line_vals=PROVIDERS, - line_names=PROVIDERS, - ylabel="TFLOP/s (larger is better)", - plot_name="FP16 vs Hybrid W4A16", - args={}, - ) -) -def benchmark(batch_size, provider, N, K, group_size, weights): - M = batch_size - device = "cuda" - dtype = torch.float16 - a = torch.randn((M, K), device=device, dtype=dtype) - - quantiles = [0.5, 0.2, 0.8] - - if provider == "torch-fp16": - w_fp16 = weights["w_fp16"] - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - lambda: torch.nn.functional.linear(a, w_fp16), - quantiles=quantiles, - ) - elif provider in ("hybrid-w4a16", "hybrid-w4a16-zp"): - from vllm.model_executor.kernels.linear.mixed_precision.hybrid_w4a16 import ( - _hybrid_w4a16_apply_impl, - ) - from vllm.utils.platform_utils import num_compute_units - - w = weights - cu_count = num_compute_units() - use_zp = provider == "hybrid-w4a16-zp" - - def run(): - return 
_hybrid_w4a16_apply_impl( - a, - w["w_q_skinny"], - w["w_s_skinny"], - w["w_q_skinny_i32"], - w["w_zp"] if use_zp else None, - None, # bias - cu_count, - group_size, - ) - - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - run, - quantiles=quantiles, - ) - else: - return 0.0, 0.0, 0.0 - - to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) - return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) - - -def prepare_shapes(args): - KN_model_names = [] - for model, tp_size in itertools.product(args.models, args.tp_sizes): - for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): - KN[tp_dim] //= tp_size - KN.append(model) - KN_model_names.append(KN) - return KN_model_names - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark HybridW4A16LinearKernel") - parser.add_argument( - "--models", - nargs="+", - type=str, - default=["Qwen/Qwen3-4B"], - choices=list(WEIGHT_SHAPES.keys()), - ) - parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) - parser.add_argument("--group-size", type=int, default=128) - parser.add_argument("--save-path", type=str, default=None) - args = parser.parse_args() - - for K, N, model in prepare_shapes(args): - group_size = args.group_size - print(f"\n{'=' * 70}") - print(f"{model}, N={N} K={K}, group_size={group_size}") - print(f"{'=' * 70}") - - w_q_skinny, w_s_skinny, w_fp16, w_q_skinny_i32, w_zp = prepare_hybrid_weights( - K, N, group_size - ) - - weights = { - "w_q_skinny": w_q_skinny, - "w_s_skinny": w_s_skinny, - "w_fp16": w_fp16, - "w_q_skinny_i32": w_q_skinny_i32, - "w_zp": w_zp, - } - - save_path = args.save_path or f"bench_int4_res_n{N}_k{K}" - os.makedirs(save_path, exist_ok=True) - benchmark.run( - print_data=True, - show_plots=False, - save_path=save_path, - N=N, - K=K, - group_size=group_size, - weights=weights, - ) - - print("\nBenchmark finished!") diff --git a/pyproject.toml b/pyproject.toml index f37d5ce02375..51818d98a6e8 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -109,6 +109,7 @@ markers = [ "split: run this test as part of a split", "distributed: run this test only in distributed GPU tests", "optional: optional tests that are automatically skipped, include --optional to run them", + "benchmark: performance regression tests", ] [tool.ty.src] diff --git a/tests/conftest.py b/tests/conftest.py index 49400361c229..6c3b676ca0ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1439,6 +1439,18 @@ def pytest_addoption(parser): default=1000 + secrets.randbelow(9000), help="random seed for tests that opt in", ) + parser.addoption( + "--measure-baselines", + action="store_true", + default=False, + help="record performance measurements instead of asserting", + ) + parser.addoption( + "--intermittent", + action="store_true", + default=False, + help="include intermittent (noisy) benchmark cases", + ) def pytest_report_header(config): diff --git a/tests/kernels/quantization/conftest.py b/tests/kernels/quantization/conftest.py new file mode 100644 index 000000000000..1feba0e05894 --- /dev/null +++ b/tests/kernels/quantization/conftest.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""conftest for quantization kernel tests. + +Adds session-finish hook that writes measured results to the ``measured/`` +directory when ``--measure-baselines`` is set. +""" + +from __future__ import annotations + +import json +import pathlib +import shutil + +import pytest + +_HERE = pathlib.Path(__file__).resolve().parent +_MEASURED_DIR = _HERE / "measured" + +# Attribute name on config for the measured-results dict. 
+_ATTR = "_hybrid_w4a16_measured_results" +_TEMP_ATTR = "_hybrid_w4a16_temp_log" + + +def get_measured_results(config: pytest.Config) -> dict[str, list[dict]]: + """Return (creating if needed) the session-scoped measurement dict.""" + d = getattr(config, _ATTR, None) + if d is None: + d = {} + setattr(config, _ATTR, d) + return d + + +def get_temp_log(config: pytest.Config) -> list[tuple[float, str, float]]: + """Return (creating if needed) the session-scoped temperature log.""" + log = getattr(config, _TEMP_ATTR, None) + if log is None: + log = [] + setattr(config, _TEMP_ATTR, log) + return log + + +def pytest_configure(config: pytest.Config) -> None: + if config.getoption("--measure-baselines", default=False): + # Wipe previous measured/ so aborted runs don't leave stale data. + if _MEASURED_DIR.exists(): + shutil.rmtree(_MEASURED_DIR) + _MEASURED_DIR.mkdir(parents=True, exist_ok=True) + + +def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None: + # Write temperature log if path was set + import os + + temp_log_path = os.environ.get("TEMP_LOG_PATH", "") + if temp_log_path: + log = get_temp_log(session.config) + if log: + t0 = log[0][0] + with open(temp_log_path, "w") as f: + f.write("elapsed_s,label,temp_C\n") + for ts, label, temp in log: + f.write(f"{ts - t0:.2f},{label},{temp:.1f}\n") + + if not session.config.getoption("--measure-baselines", default=False): + return + + results = get_measured_results(session.config) + if not results: + return + + _MEASURED_DIR.mkdir(parents=True, exist_ok=True) + + for gpu, shapes in results.items(): + out_path = _MEASURED_DIR / f"hybrid_w4a16_{gpu}.json" + data = {"gpu": gpu, "shapes": shapes} + out_path.write_text(json.dumps(data, indent=2) + "\n") diff --git a/tests/kernels/quantization/golden/README.md b/tests/kernels/quantization/golden/README.md new file mode 100644 index 000000000000..e3dfee5d085d --- /dev/null +++ b/tests/kernels/quantization/golden/README.md @@ -0,0 +1,119 @@ +# Golden Baselines 
for Kernel Performance Regression Tests + + This directory contains per-GPU JSON files with golden TFLOP/s baselines for + the hybrid W4A16 kernel. The test in `test_hybrid_w4a16_perf.py` compares + measured performance against these values using a two-sided tolerance band. + +## JSON Schema + + ```json + { + "gpu": "", + "shapes": [ + { + "in_features": 2560, + "out_features": 3840, + "group_size": 128, + "comment": "Qwen3-4B qkv_proj", + "skip": "(optional) reason to skip entire shape", + "providers": [ + { + "provider": "hybrid-w4a16", + "skip": "(optional) reason to skip this provider", + "baselines": [ + { + "batch_size": 1, + "tflops": 5.12, + "expected_failure": "(optional) reason this is expected to fail", + "intermittent": false, + "skip": "(optional) reason to skip this batch size" + } + ] + } + ] + } + ] + } + ``` + +### Fields + + | Field | Level | Description | + | --- | --- | --- | + | `gpu` | top | GCN architecture prefix (e.g. `gfx1151`). Matched against `_GCN_ARCH` at runtime. | + | `in_features` | shape | K dimension of the GEMM. | + | `out_features` | shape | N dimension of the GEMM. | + | `group_size` | shape | Quantization group size (typically 128). | + | `comment` | shape | Human-readable label (model name + layer). | + | `skip` | shape/provider/baseline | When present, the item is skipped. Value is the reason string. `tflops` is not required when `skip` is set. | + | `provider` | provider | Kernel variant: `hybrid-w4a16` or `hybrid-w4a16-zp`. | + | `tflops` | baseline | Golden TFLOP/s value for this batch size. | + | `batch_size` | baseline | M dimension (number of tokens). | + | `expected_failure` | baseline | When present, out-of-band results are silently accepted. If the measurement lands *inside* the band, the test errors (unexpected pass). | + | `intermittent` | baseline | When `true`, this batch size is skipped unless `--intermittent` is passed. 
| + +### Constraints + +- Shapes in the JSON must be a subset of `SHAPES` in + `test_hybrid_w4a16_perf.py`. Extra shapes in the JSON cause a collection + error. +- Batch sizes in the JSON must be a subset of `BATCH_SIZES` in the test file. + Extra batch sizes cause a collection error. +- Shapes are sorted by `(in_features, out_features, group_size)` for clean + diffs. + +## Adding a New GPU Target + + 1. Run `--measure-baselines` on the new GPU. No golden file needs to exist + first -- the test bootstraps from `SHAPES x PROVIDERS x BATCH_SIZES`: + + ```bash + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \ + --measure-baselines -s + ``` + + 2. Inspect `measured/hybrid_w4a16_.json`. + + 3. Copy the measured file here: + + ```bash + cp tests/kernels/quantization/measured/hybrid_w4a16_.json \ + tests/kernels/quantization/golden/ + ``` + + 4. Commit. No code changes needed -- the test auto-discovers JSON files by + GPU match. + +## Adding New Shapes + + 1. Add the shape to `SHAPES` in `test_hybrid_w4a16_perf.py`. + 2. Run `--measure-baselines` on each GPU that has a golden file. + 3. Diff `golden/` vs `measured/`, copy updated files, commit. + +## Contributor Workflow + +### No performance impact expected + + Push the change. CI runs the test. If it passes, done. + +### Intentional performance change + + ```bash + # On a matching GPU: + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \ + --measure-baselines -s + + # Compare: + diff -r tests/kernels/quantization/golden/ \ + tests/kernels/quantization/measured/ + + # Update golden values: + cp tests/kernels/quantization/measured/hybrid_w4a16_gfx1151.json \ + tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json + + # Commit JSON alongside the kernel change: + git add tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json + git commit + ``` + + The `measured/` directory is in `.gitignore` and should never be committed. 
diff --git a/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json b/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json new file mode 100644 index 000000000000..ad09bf67d162 --- /dev/null +++ b/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json @@ -0,0 +1,2079 @@ +{ + "gpu": "gfx1151", + "shapes": [ + { + "in_features": 2048, + "out_features": 2048, + "group_size": 128, + "comment": "gemma-2b o_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.3956 + }, + { + "batch_size": 2, + "tflops": 2.1364 + }, + { + "batch_size": 4, + "tflops": 2.6879 + }, + { + "batch_size": 8, + "tflops": 3.4325 + }, + { + "batch_size": 16, + "tflops": 6.7379 + }, + { + "batch_size": 32, + "tflops": 13.0322 + }, + { + "batch_size": 64, + "tflops": 12.7756 + }, + { + "batch_size": 128, + "tflops": 20.2416 + }, + { + "batch_size": 256, + "tflops": 17.5314 + }, + { + "batch_size": 512, + "tflops": 18.471 + }, + { + "batch_size": 1024, + "tflops": 18.5844 + }, + { + "batch_size": 2048, + "tflops": 23.7047 + }, + { + "batch_size": 4096, + "tflops": 23.3858 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.2443 + }, + { + "batch_size": 2, + "tflops": 1.905 + }, + { + "batch_size": 4, + "tflops": 2.5669 + }, + { + "batch_size": 8, + "tflops": 3.4513 + }, + { + "batch_size": 16, + "tflops": 6.8081 + }, + { + "batch_size": 32, + "tflops": 13.0672 + }, + { + "batch_size": 64, + "tflops": 12.2089 + }, + { + "batch_size": 128, + "tflops": 19.5104 + }, + { + "batch_size": 256, + "tflops": 17.2851 + }, + { + "batch_size": 512, + "tflops": 17.2427 + }, + { + "batch_size": 1024, + "tflops": 18.065 + }, + { + "batch_size": 2048, + "tflops": 22.2069 + }, + { + "batch_size": 4096, + "tflops": 22.3827 + } + ] + } + ] + }, + { + "in_features": 2048, + "out_features": 2560, + "group_size": 128, + "comment": "gemma-2b qkv_proj", + "providers": [ + { + "provider": 
"hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.2669 + }, + { + "batch_size": 2, + "tflops": 2.1339 + }, + { + "batch_size": 4, + "tflops": 3.0023 + }, + { + "batch_size": 8, + "tflops": 3.875 + }, + { + "batch_size": 16, + "tflops": 7.5271 + }, + { + "batch_size": 32, + "tflops": 14.0853 + }, + { + "batch_size": 64, + "tflops": 13.6546 + }, + { + "batch_size": 128, + "tflops": 21.2751 + }, + { + "batch_size": 256, + "tflops": 17.3688 + }, + { + "batch_size": 512, + "tflops": 17.7514 + }, + { + "batch_size": 1024, + "tflops": 19.0102 + }, + { + "batch_size": 2048, + "tflops": 24.1754 + }, + { + "batch_size": 4096, + "tflops": 24.3621 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.1927 + }, + { + "batch_size": 2, + "tflops": 1.9083 + }, + { + "batch_size": 4, + "tflops": 2.8705 + }, + { + "batch_size": 8, + "tflops": 3.7228 + }, + { + "batch_size": 16, + "tflops": 7.3855 + }, + { + "batch_size": 32, + "tflops": 13.791 + }, + { + "batch_size": 64, + "tflops": 13.0059 + }, + { + "batch_size": 128, + "tflops": 21.3741 + }, + { + "batch_size": 256, + "tflops": 16.9618 + }, + { + "batch_size": 512, + "tflops": 17.3823 + }, + { + "batch_size": 1024, + "tflops": 18.4831 + }, + { + "batch_size": 2048, + "tflops": 23.381 + }, + { + "batch_size": 4096, + "tflops": 23.9851 + } + ] + } + ] + }, + { + "in_features": 2048, + "out_features": 32768, + "group_size": 128, + "comment": "gemma-2b gate_up_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9247 + }, + { + "batch_size": 2, + "tflops": 1.7998 + }, + { + "batch_size": 4, + "tflops": 3.4455 + }, + { + "batch_size": 8, + "tflops": 4.4411 + }, + { + "batch_size": 16, + "tflops": 8.6629 + }, + { + "batch_size": 32, + "tflops": 16.4144 + }, + { + "batch_size": 64, + "tflops": 9.2579 + }, + { + "batch_size": 128, + "tflops": 21.4924 + }, + { + "batch_size": 256, + "tflops": 25.5516 + }, + { + 
"batch_size": 512, + "tflops": 25.4765 + }, + { + "batch_size": 1024, + "tflops": 24.4269 + }, + { + "batch_size": 2048, + "tflops": 21.9715 + }, + { + "batch_size": 4096, + "tflops": 19.8008 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.8996 + }, + { + "batch_size": 2, + "tflops": 1.7079 + }, + { + "batch_size": 4, + "tflops": 3.2741 + }, + { + "batch_size": 8, + "tflops": 4.1717 + }, + { + "batch_size": 16, + "tflops": 8.0755 + }, + { + "batch_size": 32, + "tflops": 15.5206 + }, + { + "batch_size": 64, + "tflops": 7.4691 + }, + { + "batch_size": 128, + "tflops": 20.8705 + }, + { + "batch_size": 256, + "tflops": 25.5421 + }, + { + "batch_size": 512, + "tflops": 25.4748 + }, + { + "batch_size": 1024, + "tflops": 24.6075 + }, + { + "batch_size": 2048, + "tflops": 22.0469 + }, + { + "batch_size": 4096, + "tflops": 19.8537 + } + ] + } + ] + }, + { + "in_features": 2560, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B o_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.0679 + }, + { + "batch_size": 2, + "tflops": 1.9995 + }, + { + "batch_size": 4, + "tflops": 3.161 + }, + { + "batch_size": 8, + "tflops": 4.3066 + }, + { + "batch_size": 16, + "tflops": 8.146 + }, + { + "batch_size": 32, + "tflops": 15.5181 + }, + { + "batch_size": 64, + "tflops": 17.0524 + }, + { + "batch_size": 128, + "tflops": 20.2958 + }, + { + "batch_size": 256, + "tflops": 22.0882 + }, + { + "batch_size": 512, + "tflops": 23.9008 + }, + { + "batch_size": 1024, + "tflops": 24.4399 + }, + { + "batch_size": 2048, + "tflops": 24.7905 + }, + { + "batch_size": 4096, + "tflops": 24.621 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9698 + }, + { + "batch_size": 2, + "tflops": 1.8477 + }, + { + "batch_size": 4, + "tflops": 3.0152 + }, + { + "batch_size": 8, + "tflops": 4.3119 + }, + { + "batch_size": 16, + 
"tflops": 8.198 + }, + { + "batch_size": 32, + "tflops": 15.6252 + }, + { + "batch_size": 64, + "tflops": 16.0473 + }, + { + "batch_size": 128, + "tflops": 19.2885 + }, + { + "batch_size": 256, + "tflops": 20.8525 + }, + { + "batch_size": 512, + "tflops": 23.1322 + }, + { + "batch_size": 1024, + "tflops": 24.1483 + }, + { + "batch_size": 2048, + "tflops": 24.2402 + }, + { + "batch_size": 4096, + "tflops": 24.7659 + } + ] + } + ] + }, + { + "in_features": 2560, + "out_features": 3840, + "group_size": 128, + "comment": "Qwen3-4B qkv_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9975 + }, + { + "batch_size": 2, + "tflops": 1.8936 + }, + { + "batch_size": 4, + "tflops": 3.8242 + }, + { + "batch_size": 8, + "tflops": 4.4486 + }, + { + "batch_size": 16, + "tflops": 8.6678 + }, + { + "batch_size": 32, + "tflops": 16.6232 + }, + { + "batch_size": 64, + "tflops": 17.7952 + }, + { + "batch_size": 128, + "tflops": 23.7005 + }, + { + "batch_size": 256, + "tflops": 24.3613 + }, + { + "batch_size": 512, + "tflops": 21.7255 + }, + { + "batch_size": 1024, + "tflops": 25.0983 + }, + { + "batch_size": 2048, + "tflops": 25.7907 + }, + { + "batch_size": 4096, + "tflops": 26.2186 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9196 + }, + { + "batch_size": 2, + "tflops": 1.7588 + }, + { + "batch_size": 4, + "tflops": 3.4896 + }, + { + "batch_size": 8, + "tflops": 4.38 + }, + { + "batch_size": 16, + "tflops": 8.4495 + }, + { + "batch_size": 32, + "tflops": 16.3339 + }, + { + "batch_size": 64, + "tflops": 17.2289 + }, + { + "batch_size": 128, + "tflops": 23.5615 + }, + { + "batch_size": 256, + "tflops": 23.8283 + }, + { + "batch_size": 512, + "tflops": 21.3179 + }, + { + "batch_size": 1024, + "tflops": 24.2931 + }, + { + "batch_size": 2048, + "tflops": 25.3921 + }, + { + "batch_size": 4096, + "tflops": 25.9323 + } + ] + } + ] + }, + { + "in_features": 2560, + 
"out_features": 19456, + "group_size": 128, + "comment": "Qwen3-4B gate_up_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.894 + }, + { + "batch_size": 2, + "tflops": 1.7246 + }, + { + "batch_size": 4, + "tflops": 3.3242 + }, + { + "batch_size": 8, + "tflops": 4.5236 + }, + { + "batch_size": 16, + "tflops": 8.645 + }, + { + "batch_size": 32, + "tflops": 16.7812 + }, + { + "batch_size": 64, + "tflops": 21.7332 + }, + { + "batch_size": 128, + "tflops": 24.898 + }, + { + "batch_size": 256, + "tflops": 25.4871 + }, + { + "batch_size": 512, + "tflops": 25.5385 + }, + { + "batch_size": 1024, + "tflops": 25.4898 + }, + { + "batch_size": 2048, + "tflops": 25.2532 + }, + { + "batch_size": 4096, + "tflops": 24.7898 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.8544 + }, + { + "batch_size": 2, + "tflops": 1.6239 + }, + { + "batch_size": 4, + "tflops": 3.1761 + }, + { + "batch_size": 8, + "tflops": 4.4362 + }, + { + "batch_size": 16, + "tflops": 8.4064 + }, + { + "batch_size": 32, + "tflops": 16.3875 + }, + { + "batch_size": 64, + "tflops": 15.2236 + }, + { + "batch_size": 128, + "tflops": 24.6912 + }, + { + "batch_size": 256, + "tflops": 25.3165 + }, + { + "batch_size": 512, + "tflops": 25.2057 + }, + { + "batch_size": 1024, + "tflops": 24.8581 + }, + { + "batch_size": 2048, + "tflops": 24.9453 + }, + { + "batch_size": 4096, + "tflops": 24.8502 + } + ] + } + ] + }, + { + "in_features": 3584, + "out_features": 3584, + "group_size": 128, + "comment": "Qwen2.5-7B o_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9845 + }, + { + "batch_size": 2, + "tflops": 1.8544 + }, + { + "batch_size": 4, + "tflops": 3.7296 + }, + { + "batch_size": 8, + "tflops": 4.2831 + }, + { + "batch_size": 16, + "tflops": 8.3524 + }, + { + "batch_size": 32, + "tflops": 16.109 + }, + { + "batch_size": 64, + "tflops": 
17.7894 + }, + { + "batch_size": 128, + "tflops": 18.3642 + }, + { + "batch_size": 256, + "tflops": 23.222 + }, + { + "batch_size": 512, + "tflops": 21.6664 + }, + { + "batch_size": 1024, + "tflops": 24.1986 + }, + { + "batch_size": 2048, + "tflops": 25.5559 + }, + { + "batch_size": 4096, + "tflops": 25.6219 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9254 + }, + { + "batch_size": 2, + "tflops": 1.7193 + }, + { + "batch_size": 4, + "tflops": 3.3911 + }, + { + "batch_size": 8, + "tflops": 4.1332 + }, + { + "batch_size": 16, + "tflops": 7.9547 + }, + { + "batch_size": 32, + "tflops": 15.5994 + }, + { + "batch_size": 64, + "tflops": 17.3155 + }, + { + "batch_size": 128, + "tflops": 17.7375 + }, + { + "batch_size": 256, + "tflops": 21.7902 + }, + { + "batch_size": 512, + "tflops": 21.1015 + }, + { + "batch_size": 1024, + "tflops": 23.198 + }, + { + "batch_size": 2048, + "tflops": 24.9341 + }, + { + "batch_size": 4096, + "tflops": 25.4049 + } + ] + } + ] + }, + { + "in_features": 3584, + "out_features": 4608, + "group_size": 128, + "comment": "Qwen2.5-7B qkv_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9589 + }, + { + "batch_size": 2, + "tflops": 1.8141 + }, + { + "batch_size": 4, + "tflops": 3.6611 + }, + { + "batch_size": 8, + "tflops": 4.2792 + }, + { + "batch_size": 16, + "tflops": 8.2917 + }, + { + "batch_size": 32, + "tflops": 16.0026 + }, + { + "batch_size": 64, + "tflops": 18.9172 + }, + { + "batch_size": 128, + "tflops": 23.4304 + }, + { + "batch_size": 256, + "tflops": 22.9194 + }, + { + "batch_size": 512, + "tflops": 22.9634 + }, + { + "batch_size": 1024, + "tflops": 23.9841 + }, + { + "batch_size": 2048, + "tflops": 25.1618 + }, + { + "batch_size": 4096, + "tflops": 24.8986 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9077 + }, + { + "batch_size": 2, + "tflops": 1.6596 + }, + 
{ + "batch_size": 4, + "tflops": 3.3327 + }, + { + "batch_size": 8, + "tflops": 4.1092 + }, + { + "batch_size": 16, + "tflops": 7.2905 + }, + { + "batch_size": 32, + "tflops": 14.346 + }, + { + "batch_size": 64, + "tflops": 17.8276 + }, + { + "batch_size": 128, + "tflops": 22.2431 + }, + { + "batch_size": 256, + "tflops": 21.614 + }, + { + "batch_size": 512, + "tflops": 22.1398 + }, + { + "batch_size": 1024, + "tflops": 22.8507 + }, + { + "batch_size": 2048, + "tflops": 24.3894 + }, + { + "batch_size": 4096, + "tflops": 24.473 + } + ] + } + ] + }, + { + "in_features": 3584, + "out_features": 37888, + "group_size": 128, + "comment": "Qwen2.5-7B gate_up_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9494 + }, + { + "batch_size": 2, + "tflops": 1.7944 + }, + { + "batch_size": 4, + "tflops": 3.5244 + }, + { + "batch_size": 8, + "tflops": 4.5967 + }, + { + "batch_size": 16, + "tflops": 8.8868 + }, + { + "batch_size": 32, + "tflops": 17.446 + }, + { + "batch_size": 64, + "tflops": 21.4765 + }, + { + "batch_size": 128, + "tflops": 24.5377 + }, + { + "batch_size": 256, + "tflops": 25.5151 + }, + { + "batch_size": 512, + "tflops": 25.6997 + }, + { + "batch_size": 1024, + "tflops": 25.4979 + }, + { + "batch_size": 2048, + "tflops": 25.4565 + }, + { + "batch_size": 4096, + "tflops": 25.6372 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9444 + }, + { + "batch_size": 2, + "tflops": 1.7198 + }, + { + "batch_size": 4, + "tflops": 3.3862 + }, + { + "batch_size": 8, + "tflops": 4.445 + }, + { + "batch_size": 16, + "tflops": 8.5539 + }, + { + "batch_size": 32, + "tflops": 16.8321 + }, + { + "batch_size": 64, + "tflops": 14.6204 + }, + { + "batch_size": 128, + "tflops": 24.8905 + }, + { + "batch_size": 256, + "tflops": 25.3239 + }, + { + "batch_size": 512, + "tflops": 25.0395 + }, + { + "batch_size": 1024, + "tflops": 24.8458 + }, + { + "batch_size": 2048, + 
"tflops": 24.881 + }, + { + "batch_size": 4096, + "tflops": 25.3841 + } + ] + } + ] + }, + { + "in_features": 8192, + "out_features": 512, + "group_size": 128, + "comment": "L2 2MiB at", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.1894 + }, + { + "batch_size": 2, + "tflops": 1.7129 + }, + { + "batch_size": 4, + "tflops": 2.5526 + }, + { + "batch_size": 8, + "tflops": 1.6675 + }, + { + "batch_size": 16, + "tflops": 3.1527 + }, + { + "batch_size": 32, + "tflops": 5.4616 + }, + { + "batch_size": 64, + "tflops": 3.5551 + }, + { + "batch_size": 128, + "tflops": 9.7177 + }, + { + "batch_size": 256, + "tflops": 16.9947 + }, + { + "batch_size": 512, + "tflops": 19.7511 + }, + { + "batch_size": 1024, + "tflops": 23.1965 + }, + { + "batch_size": 2048, + "tflops": 17.2034 + }, + { + "batch_size": 4096, + "tflops": 14.4072 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9803 + }, + { + "batch_size": 2, + "tflops": 1.4679 + }, + { + "batch_size": 4, + "tflops": 2.4295 + }, + { + "batch_size": 8, + "tflops": 1.5168 + }, + { + "batch_size": 16, + "tflops": 2.9111 + }, + { + "batch_size": 32, + "tflops": 5.0758 + }, + { + "batch_size": 64, + "tflops": 3.3141 + }, + { + "batch_size": 128, + "tflops": 9.3917 + }, + { + "batch_size": 256, + "tflops": 16.8781 + }, + { + "batch_size": 512, + "tflops": 19.6022 + }, + { + "batch_size": 1024, + "tflops": 23.0311 + }, + { + "batch_size": 2048, + "tflops": 16.2197 + }, + { + "batch_size": 4096, + "tflops": 12.3815 + } + ] + } + ] + }, + { + "in_features": 8320, + "out_features": 512, + "group_size": 128, + "comment": "L2 2MiB above", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.1795 + }, + { + "batch_size": 2, + "tflops": 1.794 + }, + { + "batch_size": 4, + "tflops": 0.8467 + }, + { + "batch_size": 8, + "tflops": 1.6837 + }, + { + "batch_size": 16, + "tflops": 3.3689 
+ }, + { + "batch_size": 32, + "tflops": 6.6263 + }, + { + "batch_size": 64, + "tflops": 4.5296 + }, + { + "batch_size": 128, + "tflops": 10.7239 + }, + { + "batch_size": 256, + "tflops": 17.3085 + }, + { + "batch_size": 512, + "tflops": 19.827 + }, + { + "batch_size": 1024, + "tflops": 24.4261 + }, + { + "batch_size": 2048, + "tflops": 20.3098 + }, + { + "batch_size": 4096, + "tflops": 20.7601 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 1.0345 + }, + { + "batch_size": 2, + "tflops": 1.6052 + }, + { + "batch_size": 4, + "tflops": 0.7527 + }, + { + "batch_size": 8, + "tflops": 1.4949 + }, + { + "batch_size": 16, + "tflops": 2.9586 + }, + { + "batch_size": 32, + "tflops": 5.8479 + }, + { + "batch_size": 64, + "tflops": 4.2138 + }, + { + "batch_size": 128, + "tflops": 10.9798 + }, + { + "batch_size": 256, + "tflops": 16.9799 + }, + { + "batch_size": 512, + "tflops": 19.4053 + }, + { + "batch_size": 1024, + "tflops": 23.9872 + }, + { + "batch_size": 2048, + "tflops": 20.3439 + }, + { + "batch_size": 4096, + "tflops": 21.2659 + } + ] + } + ] + }, + { + "in_features": 9728, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B down_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.9417 + }, + { + "batch_size": 2, + "tflops": 1.8368 + }, + { + "batch_size": 4, + "tflops": 2.2093 + }, + { + "batch_size": 8, + "tflops": 4.2852 + }, + { + "batch_size": 16, + "tflops": 8.5736 + }, + { + "batch_size": 32, + "tflops": 16.6779 + }, + { + "batch_size": 64, + "tflops": 16.5347 + }, + { + "batch_size": 128, + "tflops": 24.0537 + }, + { + "batch_size": 256, + "tflops": 25.0934 + }, + { + "batch_size": 512, + "tflops": 25.5208 + }, + { + "batch_size": 1024, + "tflops": 24.8518 + }, + { + "batch_size": 2048, + "tflops": 24.6247 + }, + { + "batch_size": 4096, + "tflops": 24.7359 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + 
"batch_size": 1, + "tflops": 0.9087 + }, + { + "batch_size": 2, + "tflops": 1.7045 + }, + { + "batch_size": 4, + "tflops": 2.2514 + }, + { + "batch_size": 8, + "tflops": 4.3491 + }, + { + "batch_size": 16, + "tflops": 8.6603 + }, + { + "batch_size": 32, + "tflops": 16.8873 + }, + { + "batch_size": 64, + "tflops": 15.7646 + }, + { + "batch_size": 128, + "tflops": 23.9267 + }, + { + "batch_size": 256, + "tflops": 24.96 + }, + { + "batch_size": 512, + "tflops": 25.5181 + }, + { + "batch_size": 1024, + "tflops": 24.9927 + }, + { + "batch_size": 2048, + "tflops": 23.9842 + }, + { + "batch_size": 4096, + "tflops": 23.4345 + } + ] + } + ] + }, + { + "in_features": 16384, + "out_features": 2048, + "group_size": 128, + "comment": "gemma-2b down_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.6622 + }, + { + "batch_size": 2, + "tflops": 1.2574 + }, + { + "batch_size": 4, + "tflops": 1.7651 + }, + { + "batch_size": 8, + "tflops": 3.4955 + }, + { + "batch_size": 16, + "tflops": 6.9043 + }, + { + "batch_size": 32, + "tflops": 13.3775 + }, + { + "batch_size": 64, + "tflops": 12.3375 + }, + { + "batch_size": 128, + "tflops": 19.4665 + }, + { + "batch_size": 256, + "tflops": 22.2801 + }, + { + "batch_size": 512, + "tflops": 20.5712 + }, + { + "batch_size": 1024, + "tflops": 22.1274 + }, + { + "batch_size": 2048, + "tflops": 14.5177 + }, + { + "batch_size": 4096, + "tflops": 15.7439 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.619 + }, + { + "batch_size": 2, + "tflops": 1.1881 + }, + { + "batch_size": 4, + "tflops": 1.6908 + }, + { + "batch_size": 8, + "tflops": 3.3439 + }, + { + "batch_size": 16, + "tflops": 6.5966 + }, + { + "batch_size": 32, + "tflops": 12.7721 + }, + { + "batch_size": 64, + "tflops": 10.3608 + }, + { + "batch_size": 128, + "tflops": 17.3985 + }, + { + "batch_size": 256, + "tflops": 21.4558 + }, + { + "batch_size": 512, + "tflops": 19.6548 + 
}, + { + "batch_size": 1024, + "tflops": 21.6144 + }, + { + "batch_size": 2048, + "tflops": 13.1811 + }, + { + "batch_size": 4096, + "tflops": 14.2666 + } + ] + } + ] + }, + { + "in_features": 18944, + "out_features": 3584, + "group_size": 128, + "comment": "Qwen2.5-7B down_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.8843 + }, + { + "batch_size": 2, + "tflops": 1.1249 + }, + { + "batch_size": 4, + "tflops": 2.2302 + }, + { + "batch_size": 8, + "tflops": 4.4101 + }, + { + "batch_size": 16, + "tflops": 8.4345 + }, + { + "batch_size": 32, + "tflops": 16.6272 + }, + { + "batch_size": 64, + "tflops": 18.1639 + }, + { + "batch_size": 128, + "tflops": 17.9899 + }, + { + "batch_size": 256, + "tflops": 22.8929 + }, + { + "batch_size": 512, + "tflops": 24.0236 + }, + { + "batch_size": 1024, + "tflops": 23.892 + }, + { + "batch_size": 2048, + "tflops": 23.0834 + }, + { + "batch_size": 4096, + "tflops": 23.6381 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.8621 + }, + { + "batch_size": 2, + "tflops": 1.0963 + }, + { + "batch_size": 4, + "tflops": 2.1254 + }, + { + "batch_size": 8, + "tflops": 4.1472 + }, + { + "batch_size": 16, + "tflops": 8.2929 + }, + { + "batch_size": 32, + "tflops": 16.0558 + }, + { + "batch_size": 64, + "tflops": 16.2587 + }, + { + "batch_size": 128, + "tflops": 17.5202 + }, + { + "batch_size": 256, + "tflops": 22.6163 + }, + { + "batch_size": 512, + "tflops": 24.0904 + }, + { + "batch_size": 1024, + "tflops": 24.3297 + }, + { + "batch_size": 2048, + "tflops": 22.3802 + }, + { + "batch_size": 4096, + "tflops": 22.4165 + } + ] + } + ] + }, + { + "in_features": 38912, + "out_features": 2048, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 != 0", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.4265 + }, + { + "batch_size": 2, + "tflops": 0.8436 + }, + { + 
"batch_size": 4, + "tflops": 1.6834 + }, + { + "batch_size": 8, + "tflops": 3.1377 + }, + { + "batch_size": 16, + "tflops": 5.5249 + }, + { + "batch_size": 32, + "tflops": 9.3102 + }, + { + "batch_size": 64, + "tflops": 6.8117 + }, + { + "batch_size": 128, + "tflops": 18.2063 + }, + { + "batch_size": 256, + "tflops": 23.2781 + }, + { + "batch_size": 512, + "tflops": 21.9953 + }, + { + "batch_size": 1024, + "tflops": 20.45 + }, + { + "batch_size": 2048, + "tflops": 15.5919 + }, + { + "batch_size": 4096, + "tflops": 16.4851 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.3541 + }, + { + "batch_size": 2, + "tflops": 0.6942 + }, + { + "batch_size": 4, + "tflops": 1.3525 + }, + { + "batch_size": 8, + "tflops": 2.558 + }, + { + "batch_size": 16, + "tflops": 4.5556 + }, + { + "batch_size": 32, + "tflops": 7.6537 + }, + { + "batch_size": 64, + "tflops": 6.5258 + }, + { + "batch_size": 128, + "tflops": 17.52 + }, + { + "batch_size": 256, + "tflops": 22.7488 + }, + { + "batch_size": 512, + "tflops": 21.7849 + }, + { + "batch_size": 1024, + "tflops": 19.1522 + }, + { + "batch_size": 2048, + "tflops": 13.7515 + }, + { + "batch_size": 4096, + "tflops": 14.5751 + } + ] + } + ] + }, + { + "in_features": 49152, + "out_features": 2048, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 == 0", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.2223 + }, + { + "batch_size": 2, + "tflops": 0.4405 + }, + { + "batch_size": 4, + "tflops": 0.8804 + }, + { + "batch_size": 8, + "tflops": 1.7199 + }, + { + "batch_size": 16, + "tflops": 3.3486 + }, + { + "batch_size": 32, + "tflops": 6.1204 + }, + { + "batch_size": 64, + "tflops": 5.0983 + }, + { + "batch_size": 128, + "tflops": 13.5035 + }, + { + "batch_size": 256, + "tflops": 17.7353 + }, + { + "batch_size": 512, + "tflops": 17.5016 + }, + { + "batch_size": 1024, + "tflops": 19.0624 + }, + { + "batch_size": 2048, + 
"tflops": 13.322 + }, + { + "batch_size": 4096, + "tflops": 14.6727 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.1883 + }, + { + "batch_size": 2, + "tflops": 0.3725 + }, + { + "batch_size": 4, + "tflops": 0.7315 + }, + { + "batch_size": 8, + "tflops": 1.3876 + }, + { + "batch_size": 16, + "tflops": 2.6876 + }, + { + "batch_size": 32, + "tflops": 4.7003 + }, + { + "batch_size": 64, + "tflops": 4.3578 + }, + { + "batch_size": 128, + "tflops": 12.7461 + }, + { + "batch_size": 256, + "tflops": 16.4239 + }, + { + "batch_size": 512, + "tflops": 16.4054 + }, + { + "batch_size": 1024, + "tflops": 17.5356 + }, + { + "batch_size": 2048, + "tflops": 12.2891 + }, + { + "batch_size": 4096, + "tflops": 13.2295 + } + ] + } + ] + }, + { + "in_features": 49152, + "out_features": 4096, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 == 0", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.2614 + }, + { + "batch_size": 2, + "tflops": 0.5111 + }, + { + "batch_size": 4, + "tflops": 0.9764 + }, + { + "batch_size": 8, + "tflops": 2.0766 + }, + { + "batch_size": 16, + "tflops": 3.9968 + }, + { + "batch_size": 32, + "tflops": 7.3814 + }, + { + "batch_size": 64, + "tflops": 6.0318 + }, + { + "batch_size": 128, + "tflops": 9.0754 + }, + { + "batch_size": 256, + "tflops": 11.8871 + }, + { + "batch_size": 512, + "tflops": 17.5524 + }, + { + "batch_size": 1024, + "tflops": 20.4928 + }, + { + "batch_size": 2048, + "tflops": 14.634 + }, + { + "batch_size": 4096, + "tflops": 15.3292 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "tflops": 0.1923 + }, + { + "batch_size": 2, + "tflops": 0.3716 + }, + { + "batch_size": 4, + "tflops": 0.729 + }, + { + "batch_size": 8, + "tflops": 1.5244 + }, + { + "batch_size": 16, + "tflops": 3.0343 + }, + { + "batch_size": 32, + "tflops": 5.8279 + }, + { + "batch_size": 64, + "tflops": 5.0934 + 
}, + { + "batch_size": 128, + "tflops": 9.985 + }, + { + "batch_size": 256, + "tflops": 11.8763 + }, + { + "batch_size": 512, + "tflops": 15.1345 + }, + { + "batch_size": 1024, + "tflops": 18.5422 + }, + { + "batch_size": 2048, + "tflops": 13.0673 + }, + { + "batch_size": 4096, + "tflops": 13.0184 + } + ] + } + ] + } + ] +} diff --git a/tests/kernels/quantization/test_hybrid_w4a16_perf.py b/tests/kernels/quantization/test_hybrid_w4a16_perf.py new file mode 100644 index 000000000000..4e470b8ebafa --- /dev/null +++ b/tests/kernels/quantization/test_hybrid_w4a16_perf.py @@ -0,0 +1,695 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Performance regression tests for the hybrid W4A16 GEMM kernel. + +Compares measured TFLOP/s against golden baselines stored in per-GPU JSON +files under ``golden/``. A two-sided tolerance band catches both regressions +and unexpected improvements. + +Usage:: + + # rep=20 for testing, rep=50 for --measure-baselines. + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \\ + -v -s + + # Measure new baselines (writes to measured/, never overwrites golden/): + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \\ + --measure-baselines -s + + # Include noisy/intermittent cases: + .venv/bin/python -m pytest tests/kernels/quantization/test_hybrid_w4a16_perf.py \\ + --intermittent -v -s + +Golden JSON schema is documented in ``golden/README.md``. 
+""" + +from __future__ import annotations + +import json +import math +import pathlib +import time +from typing import Any + +import pytest +import torch + +from vllm.platforms import current_platform + +# --------------------------------------------------------------------------- +# GPU temperature reading +# --------------------------------------------------------------------------- + + +def _read_gpu_temp() -> float: + """Return GPU edge temperature in degrees C, or NaN if unavailable.""" + try: + import amdsmi + + amdsmi.amdsmi_init() + devices = amdsmi.amdsmi_get_processor_handles() + if not devices: + return float("nan") + return float( + amdsmi.amdsmi_get_temp_metric( + devices[0], + amdsmi.AmdSmiTemperatureType.EDGE, + amdsmi.AmdSmiTemperatureMetric.CURRENT, + ) + ) + except Exception: + return float("nan") + + +def _log_temp(config: Any, label: str) -> float: + """Read and log temperature with a label. Returns temp in C.""" + from tests.kernels.quantization.conftest import get_temp_log + + t = _read_gpu_temp() + get_temp_log(config).append((time.monotonic(), label, t)) + return t + + +# --------------------------------------------------------------------------- +# Constants -- single source of truth +# --------------------------------------------------------------------------- + +SHAPES: list[dict[str, Any]] = [ + # google/gemma-2b-AWQ + { + "in_features": 2048, + "out_features": 32768, + "group_size": 128, + "comment": "gemma-2b gate_up_proj", + }, + { + "in_features": 16384, + "out_features": 2048, + "group_size": 128, + "comment": "gemma-2b down_proj", + }, + { + "in_features": 2048, + "out_features": 2560, + "group_size": 128, + "comment": "gemma-2b qkv_proj", + }, + { + "in_features": 2048, + "out_features": 2048, + "group_size": 128, + "comment": "gemma-2b o_proj", + }, + # hf-kernel: shapes with K > 32768 that dispatch to wvSplitK_int4_hf_ + # instead of wvSplitK_int4_hf_sml_ at batch=1. 
+ { + "in_features": 38912, + "out_features": 2048, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 != 0", + }, + { + "in_features": 49152, + "out_features": 2048, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 == 0", + }, + { + "in_features": 49152, + "out_features": 4096, + "group_size": 128, + "comment": "hf-kernel, K_packed % 4096 == 0", + }, + # Qwen/Qwen3-4B + { + "in_features": 2560, + "out_features": 3840, + "group_size": 128, + "comment": "Qwen3-4B qkv_proj", + }, + { + "in_features": 2560, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B o_proj", + }, + { + "in_features": 2560, + "out_features": 19456, + "group_size": 128, + "comment": "Qwen3-4B gate_up_proj", + }, + { + "in_features": 9728, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B down_proj", + }, + # Qwen/Qwen2.5-7B-Instruct + { + "in_features": 3584, + "out_features": 4608, + "group_size": 128, + "comment": "Qwen2.5-7B qkv_proj", + }, + { + "in_features": 3584, + "out_features": 3584, + "group_size": 128, + "comment": "Qwen2.5-7B o_proj", + }, + { + "in_features": 3584, + "out_features": 37888, + "group_size": 128, + "comment": "Qwen2.5-7B gate_up_proj", + }, + { + "in_features": 18944, + "out_features": 3584, + "group_size": 128, + "comment": "Qwen2.5-7B down_proj", + }, + # W4-L2-cache-boundary + { + "in_features": 8192, + "out_features": 512, + "group_size": 128, + "comment": "L2 2MiB at", + }, + { + "in_features": 8320, + "out_features": 512, + "group_size": 128, + "comment": "L2 2MiB above", + }, +] + +PROVIDERS = ["hybrid-w4a16", "hybrid-w4a16-zp"] +BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096] +TFLOPS_TOLERANCE_PCT = [-10, 10] # [low, high] allowed deviation from golden + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- + +_HERE = pathlib.Path(__file__).resolve().parent 
_GOLDEN_DIR = _HERE / "golden"

# ---------------------------------------------------------------------------
# Shape key helper
# ---------------------------------------------------------------------------

ShapeKey = tuple[int, int, int]  # (in_features, out_features, group_size)


def _shape_key(s: dict[str, Any]) -> ShapeKey:
    """Canonical identity of a shape dict: (in_features, out_features, group_size)."""
    return (s["in_features"], s["out_features"], s["group_size"])


assert len({_shape_key(s) for s in SHAPES}) == len(SHAPES), "duplicate in SHAPES"
_SHAPE_KEY_SET: set[ShapeKey] = {_shape_key(s) for s in SHAPES}
_BATCH_SIZE_SET: set[int] = set(BATCH_SIZES)

# ---------------------------------------------------------------------------
# GPU detection
# ---------------------------------------------------------------------------


def _get_gcn_arch() -> str:
    """Return the GCN architecture string, or '' on non-ROCm."""
    if not current_platform.is_rocm():
        return ""
    try:
        # Private symbol; a vllm build without it yields '' rather than crashing.
        from vllm.platforms.rocm import _GCN_ARCH

        return _GCN_ARCH
    except ImportError:
        return ""


# ---------------------------------------------------------------------------
# Golden-data loading
# ---------------------------------------------------------------------------


def _load_golden(gcn_arch: str) -> tuple[str | None, dict[str, Any] | None]:
    """Find and load the golden JSON for *gcn_arch*.

    Scans ``golden/hybrid_w4a16_*.json`` in sorted (deterministic) order and
    returns the first file whose ``"gpu"`` field is a prefix of *gcn_arch*.

    Returns ``(filename, data)`` or ``(None, None)`` when no match.
    """
    for path in sorted(_GOLDEN_DIR.glob("hybrid_w4a16_*.json")):
        data = json.loads(path.read_text())
        gpu_prefix = data.get("gpu", "")
        # A missing/empty "gpu" field would make startswith() match every
        # architecture and silently load wrong baselines; treat such files
        # as non-matching instead.
        if gpu_prefix and gcn_arch.startswith(gpu_prefix):
            return path.name, data
    return None, None


def _validate_golden(data: dict[str, Any]) -> None:
    """Raise ValueError on rogue shapes or batch sizes in the golden file.

    Shapes annotated with "skip" are exempt from the shape check so retired
    entries can be kept with an explanation.
    """
    for shape in data.get("shapes", []):
        sk = _shape_key(shape)
        if "skip" not in shape and sk not in _SHAPE_KEY_SET:
            raise ValueError(
                f"Golden file contains shape {sk} not in SHAPES. "
                "Remove it or add it to SHAPES in the test file."
            )
        for prov in shape.get("providers", []):
            for bl in prov.get("baselines", []):
                bs = bl.get("batch_size")
                if bs is not None and bs not in _BATCH_SIZE_SET:
                    raise ValueError(
                        f"Golden file contains batch_size={bs} "
                        f"(shape {sk}) not in BATCH_SIZES."
                    )


# ---------------------------------------------------------------------------
# Weight preparation (ported from benchmark script)
# ---------------------------------------------------------------------------


def prepare_hybrid_weights(
    K: int, N: int, group_size: int, device: str = "cuda"
) -> dict[str, torch.Tensor]:
    """Create random packed weights for benchmarking.

    Returns a dict with the int4-packed weight (as int8 bytes and as the
    original int32 view), per-group fp16 scales, and fp16 zero points.
    Values are random; only the layout/dtypes matter for timing.
    """
    num_groups = K // group_size

    w_q_skinny_i32 = torch.randint(
        0, 2**31, (N, K // 8), dtype=torch.int32, device=device
    )
    w_q_skinny = w_q_skinny_i32.view(torch.int8).contiguous()
    w_s_skinny = torch.randn(N, num_groups, dtype=torch.float16, device=device) * 0.01
    w_zp = torch.randint(0, 16, (N, num_groups), dtype=torch.int32, device=device).to(
        torch.float16
    )

    return {
        "w_q_skinny": w_q_skinny,
        "w_s_skinny": w_s_skinny,
        "w_q_skinny_i32": w_q_skinny_i32,
        "w_zp": w_zp,
    }


# ---------------------------------------------------------------------------
# Core measurement
# ---------------------------------------------------------------------------


COOL_DOWN_DELAY_S = 2


def measure_tflops(
    M: int,
    weights: dict[str, torch.Tensor],
    K: int,
    N: int,
    group_size: int,
    provider: str,
) -> float:
    """Run the kernel and return median TFLOP/s.

    *provider* selects whether zero points are passed ("hybrid-w4a16-zp")
    or not ("hybrid-w4a16"). Timing uses triton's CUDA-graph benchmark
    helper with the 0.5 quantile (median).
    """
    # Imported lazily so collecting this module does not require a ROCm build.
    from vllm.model_executor.kernels.linear.mixed_precision.hybrid_w4a16 import (
        _hybrid_w4a16_apply_impl,
    )
    from vllm.triton_utils import triton
    from vllm.utils.platform_utils import num_compute_units

    device = "cuda"
    dtype = torch.float16
    a = torch.randn((M, K), device=device, dtype=dtype)

    cu_count = num_compute_units()
    use_zp = provider == "hybrid-w4a16-zp"

    def run():
        return _hybrid_w4a16_apply_impl(
            a,
            weights["w_q_skinny"],
            weights["w_s_skinny"],
            weights["w_q_skinny_i32"],
            weights["w_zp"] if use_zp else None,
            None,  # bias
            cu_count,
            group_size,
        )

    ms = triton.testing.do_bench_cudagraph(run, quantiles=[0.5])
    # 2*M*N*K FLOPs per GEMM; ms -> s, FLOP/s -> TFLOP/s.
    tflops = (2 * M * N * K) * 1e-12 / (ms * 1e-3)
    return tflops


# ---------------------------------------------------------------------------
# Parametrize helpers
# ---------------------------------------------------------------------------


def _make_params() -> list[pytest.param]:
    """Build the SHAPES x PROVIDERS parameter list (sorted for stable IDs)."""
    params = []
    for shape in sorted(SHAPES, key=_shape_key):
        for prov in PROVIDERS:
            k = shape["in_features"]
            n = shape["out_features"]
            g = shape["group_size"]
            test_id = f"i{k}-o{n}-g{g}-{prov}"
            params.append(pytest.param(shape, prov, id=test_id))
    return params


# ---------------------------------------------------------------------------
# Lookup helpers
# ---------------------------------------------------------------------------


def _find_shape_in_golden(
    golden: dict[str, Any], key: ShapeKey
) -> dict[str, Any] | None:
    """Return the golden shape entry matching *key*, or None."""
    for s in golden.get("shapes", []):
        if _shape_key(s) == key:
            return s
    return None


def _find_provider_in_shape(
    shape_data: dict[str, Any], provider: str
) -> dict[str, Any] | None:
    """Return the provider entry named *provider* inside a shape, or None."""
    for p in shape_data.get("providers", []):
        if p.get("provider") == provider:
            return p
    return None


def _find_baseline(
    provider_data: dict[str, Any], batch_size: int
) -> dict[str, Any] | None:
    """Return the baseline entry for *batch_size*, or None."""
    for bl in provider_data.get("baselines", []):
        if bl.get("batch_size") == batch_size:
            return bl
    return None


# ---------------------------------------------------------------------------
# Measured-results collector for --measure-baselines
# ---------------------------------------------------------------------------


def _record_measurement(
    config: Any,
    gpu: str,
    shape: dict[str, Any],
    provider: str,
    batch_size: int,
    tflops: float,
    annotations: dict[str, Any] | None = None,
) -> None:
    """Append a measurement into the session-scoped collector.

    Entries are kept sorted (shapes by key, baselines by batch size) so the
    emitted JSON diffs cleanly against the golden files.
    """
    from tests.kernels.quantization.conftest import get_measured_results

    results = get_measured_results(config)
    shapes_list = results.setdefault(gpu, [])
    sk = _shape_key(shape)

    # Find or create shape entry
    shape_entry = None
    for s in shapes_list:
        if _shape_key(s) == sk:
            shape_entry = s
            break
    if shape_entry is None:
        shape_entry = {
            "in_features": sk[0],
            "out_features": sk[1],
            "group_size": sk[2],
            "comment": shape.get("comment", ""),
            "providers": [],
        }
        shapes_list.append(shape_entry)
        # Keep sorted
        shapes_list.sort(key=_shape_key)

    # Find or create provider entry
    prov_entry = _find_provider_in_shape(shape_entry, provider)
    if prov_entry is None:
        prov_entry = {"provider": provider, "baselines": []}
        shape_entry["providers"].append(prov_entry)

    # Build baseline entry
    bl: dict[str, Any] = {"batch_size": batch_size, "tflops": round(tflops, 4)}
    if annotations:
        bl.update(annotations)
    prov_entry["baselines"].append(bl)
    # Keep batch_sizes sorted
    prov_entry["baselines"].sort(key=lambda b: b["batch_size"])


def _record_skip(
    config: Any,
    gpu: str,
    shape: dict[str, Any],
    provider: str,
    batch_size: int,
    reason: str,
) -> None:
    """Record a skipped entry in measured results (no tflops value)."""
    _record_measurement(
        config,
        gpu,
        shape,
        provider,
        batch_size,
        0.0,
        annotations={"skip": reason},
    )
    # Remove the placeholder tflops field since it's not meaningful
    from tests.kernels.quantization.conftest import get_measured_results

    shapes_list = get_measured_results(config)[gpu]
    for s in shapes_list:
        if _shape_key(s) == _shape_key(shape):
            for p in s["providers"]:
                if p["provider"] == provider:
                    for bl in p["baselines"]:
                        if bl["batch_size"] == batch_size and "skip" in bl:
                            bl.pop("tflops", None)


#
# ---------------------------------------------------------------------------
# The test
# ---------------------------------------------------------------------------


@pytest.fixture(scope="session")
def _warm_up_gpu():
    """Run a throwaway measurement pass to bring the GPU to steady-state temp."""
    if current_platform.is_rocm():
        temp = _read_gpu_temp()
        print(f"GPU temperature: {temp:.0f}\u00b0C")
        if temp < 60.0:
            print("Warming up GPU...")
            shape = sorted(SHAPES, key=_shape_key)[0]
            K, N, gs = shape["in_features"], shape["out_features"], shape["group_size"]
            weights = prepare_hybrid_weights(K, N, gs)
            for bs in BATCH_SIZES:
                measure_tflops(bs, weights, K, N, gs, PROVIDERS[0])
            del weights
            time.sleep(COOL_DOWN_DELAY_S)
    yield


@pytest.mark.benchmark
@pytest.mark.parametrize("shape,provider", _make_params())
def test_hybrid_w4a16_perf(
    shape: dict[str, Any],
    provider: str,
    request: pytest.FixtureRequest,
    _warm_up_gpu: None,
) -> None:
    """Compare measured TFLOP/s against golden baselines for one shape/provider.

    Iterates all BATCH_SIZES for the given shape+provider, honoring per-entry
    "skip"/"intermittent"/"expected_failure" annotations in the golden file.
    In --measure-baselines mode, records results instead of asserting.
    """
    # ---- gate ----
    gcn_arch = _get_gcn_arch()
    if not current_platform.is_rocm() or not gcn_arch.startswith("gfx1151"):
        pytest.skip("ROCm gfx1151 only")

    measure_mode = request.config.getoption("--measure-baselines", default=False)
    # Measuring baselines implies running the intermittent cases too.
    intermittent_mode = (
        request.config.getoption("--intermittent", default=False) or measure_mode
    )

    # ---- load golden ----
    _, golden = _load_golden(gcn_arch)
    if golden is not None:
        _validate_golden(golden)

    if golden is None and not measure_mode:
        pytest.skip(f"No golden baselines for {gcn_arch}")

    sk = _shape_key(shape)
    K, N, group_size = sk

    # ---- shape-level skip ----
    if golden is not None:
        shape_data = _find_shape_in_golden(golden, sk)
        if shape_data is not None and "skip" in shape_data:
            pytest.skip(shape_data["skip"])
    else:
        shape_data = None

    # ---- provider-level skip ----
    if shape_data is not None:
        prov_data = _find_provider_in_shape(shape_data, provider)
        if prov_data is not None and "skip" in prov_data:
            pytest.skip(prov_data["skip"])
    else:
        prov_data = None

    # If shape not in golden, skip in normal mode
    if golden is not None and shape_data is None and not measure_mode:
        pytest.skip(f"Shape {sk} not yet measured on {gcn_arch}")
    if (
        golden is not None
        and shape_data is not None
        and prov_data is None
        and not measure_mode
    ):
        pytest.skip(f"Provider {provider} not yet measured for shape {sk}")

    # ---- cooldown + temperature log ----
    test_id = f"i{K}-o{N}-g{group_size}-{provider}"
    _log_temp(request.config, f"{test_id}:pre-sleep")
    time.sleep(COOL_DOWN_DELAY_S)
    _log_temp(request.config, f"{test_id}:post-sleep")

    # ---- allocate weights once ----
    weights = prepare_hybrid_weights(K, N, group_size)

    # ---- iterate batch sizes ----
    failures: list[str] = []
    xpass_list: list[str] = []
    exit_first = getattr(request.config.option, "exitfirst", False)

    for bs in BATCH_SIZES:
        # Look up baseline
        bl_entry: dict[str, Any] | None = None
        if prov_data is not None:
            bl_entry = _find_baseline(prov_data, bs)

        # ---- skip annotations ----
        if bl_entry is not None and "skip" in bl_entry:
            reason = bl_entry["skip"]
            print(f" batch_size={bs}: SKIP ({reason})")
            if measure_mode:
                _record_skip(request.config, gcn_arch, shape, provider, bs, reason)
            continue

        # ---- intermittent handling ----
        is_intermittent = bl_entry is not None and bl_entry.get("intermittent", False)
        if is_intermittent and not intermittent_mode:
            print(f" batch_size={bs}: SKIP (intermittent)")
            continue

        # ---- measure ----
        _log_temp(request.config, f"{test_id}:bs{bs}:pre")
        tflops = measure_tflops(bs, weights, K, N, group_size, provider)
        post_temp = _log_temp(request.config, f"{test_id}:bs{bs}:post")
        temp_tag = f" [{post_temp:.0f}\u00b0C]"

        if measure_mode:
            # Carry forward annotations
            annot: dict[str, Any] = {}
            if bl_entry is not None:
                for key in ("expected_failure", "intermittent"):
                    if key in bl_entry:
                        annot[key] = bl_entry[key]
            _record_measurement(
                request.config, gcn_arch, shape, provider, bs, tflops, annot
            )
            print(f" batch_size={bs}: {tflops:.2f} TFLOP/s (measured){temp_tag}")
            continue

        if bl_entry is None:
            print(
                f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                f"(no golden value, skipping assertion){temp_tag}"
            )
            continue

        expected = bl_entry["tflops"]

        # ---- sanity check golden value ----
        assert expected > 0 and math.isfinite(expected), (
            f"Golden tflops={expected} for batch_size={bs} is invalid. "
            "Run --measure-baselines to populate."
        )

        # ---- tolerance band ----
        lo = expected * (1 + TFLOPS_TOLERANCE_PCT[0] / 100)
        hi = expected * (1 + TFLOPS_TOLERANCE_PCT[1] / 100)
        in_band = lo <= tflops <= hi

        has_xfail = "expected_failure" in bl_entry

        if has_xfail:
            if in_band:
                xpass_list.append(
                    f" batch_size={bs}: {tflops:.2f} TFLOP/s is now within "
                    f"band [{lo:.2f}, {hi:.2f}] -- remove expected_failure "
                    f"annotation: {bl_entry['expected_failure']}"
                )
                print(
                    f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                    f"(expected {expected:.2f} "
                    f"+ {TFLOPS_TOLERANCE_PCT}%) XPASS{temp_tag}"
                )
            else:
                print(
                    f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                    f"(expected {expected:.2f} "
                    f"+ {TFLOPS_TOLERANCE_PCT}%) "
                    f"XFAIL: {bl_entry['expected_failure']}{temp_tag}"
                )
            continue

        if in_band:
            print(
                f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                f"(expected {expected:.2f} "
                f"+ {TFLOPS_TOLERANCE_PCT}%) PASS{temp_tag}"
            )
        else:
            direction = "regression" if tflops < lo else "improvement"
            delta_pct = (tflops - expected) / expected * 100
            msg = (
                f" batch_size={bs}: {tflops:.2f} TFLOP/s "
                f"(expected {expected:.2f} "
                f"+ {TFLOPS_TOLERANCE_PCT}%) "
                f"FAIL ({direction}, {delta_pct:+.1f}%){temp_tag}"
            )
            print(msg)
            if exit_first:
                if direction == "regression":
                    pytest.fail(
                        f"Performance regression at batch_size={bs}: "
                        f"{tflops:.2f} < {lo:.2f} TFLOP/s. "
                        "Run --measure-baselines to update."
                    )
                else:
                    pytest.fail(
                        f"Performance improved at batch_size={bs}: "
                        f"{tflops:.2f} > {hi:.2f} TFLOP/s. "
                        "Run --measure-baselines to update baselines."
                    )
            failures.append(msg)

    # ---- report xpass ----
    if xpass_list:
        raise AssertionError(
            "Unexpected passes -- remove expected_failure annotations:\n"
            + "\n".join(xpass_list)
        )

    # ---- report failures ----
    if failures:
        # NOTE: the "+" before the join is required. Without it, Python's
        # implicit string-literal concatenation merges the two message
        # literals and then calls .join(failures) on the *merged message*,
        # using the header as the separator and dropping it as a prefix.
        raise AssertionError(
            f"{len(failures)} batch size(s) out of tolerance band. "
            "Run --measure-baselines to update.\n"
            + "\n".join(failures)
        )