From e5e7ea19c784d2e9105beff01111a782ee412a54 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:31:10 +1200 Subject: [PATCH 1/4] more simd backend --- src/row/arch/neon.rs | 200 +++++++++++++++++++---------------- src/row/arch/wasm_simd128.rs | 56 +++++----- src/row/arch/x86_avx2.rs | 56 +++++----- src/row/arch/x86_avx512.rs | 56 +++++----- src/row/arch/x86_sse41.rs | 64 ++++++----- src/row/mod.rs | 48 ++++----- src/row/scalar.rs | 101 ++++++++++-------- 7 files changed, 318 insertions(+), 263 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index f98d9cd..54e7a07 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -39,8 +39,8 @@ use core::arch::aarch64::{ vget_high_s16, vget_high_u8, vget_high_u16, vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, vld1q_u8, vld1q_u16, vld2_u8, vld2q_u16, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, vmovl_s16, vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, - vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, - vshrq_n_u16, vst1q_u8, vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, + vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshlq_u16, + vshrq_n_s32, vst1q_u8, vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, }; use crate::{ColorMatrix, row::scalar}; @@ -190,7 +190,8 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } -/// NEON YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// NEON high‑bit‑depth YUV 4:2:0 (`BITS` ∈ {10, 12, 14}) → packed +/// **8‑bit** RGB. /// /// Block size is 16 Y pixels / 8 chroma pairs per iteration. 
The /// pipeline mirrors [`yuv_420_to_rgb_row`] byte‑for‑byte; the only @@ -199,16 +200,20 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// (16 lanes of `u8`), so each Y iteration needs two Y loads to /// cover 16 pixels — there's no widening step because the samples /// already live in 16‑bit lanes. -/// - Chroma bias is **512** (10‑bit center) rather than 128. +/// - Chroma bias is `128 << (BITS - 8)` (512 for 10‑bit, 2048 for +/// 12‑bit, 8192 for 14‑bit) rather than 128. /// - Range‑scaling params come from [`scalar::range_params_n`] with -/// `BITS = 10, OUT_BITS = 8`, so `y_scale` / `c_scale` are ~¼ the -/// 8‑bit values (mapping 10‑bit input to 8‑bit output). +/// the matching `BITS` const, so `y_scale` / `c_scale` map the +/// source depth to 8‑bit output in a single Q15 shift. +/// - Each load is AND‑masked to the low `BITS` bits so out‑of‑range +/// samples (e.g. high‑bit‑packed data mistakenly handed to the +/// low‑packed kernel) produce deterministic, backend‑consistent +/// output. /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`] — every -/// Q15 multiply / shift mirrors the scalar path exactly, with the -/// same `(prod + (1 << 14)) >> 15` rounding. +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::`] across +/// all supported bit depths. /// /// # Safety /// @@ -216,9 +221,11 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// 4. `BITS` must be one of `{10, 12, 14}` — the Q15 pipeline +/// overflows i32 at 16 bits; see [`scalar::range_params_n`]. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -234,8 +241,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: NEON availability is the caller's obligation; the @@ -248,7 +255,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); - let mask_v = vdupq_n_u16(scalar::bits_mask::<10>()); + let mask_v = vdupq_n_u16(scalar::bits_mask::()); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); let cgu = vdupq_n_s32(coeffs.g_u()); @@ -259,11 +266,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let mut x = 0usize; while x + 16 <= width { // Two Y loads cover 16 lanes; one U load + one V load cover 8 - // chroma each. Each load is AND‑masked to the low 10 bits so - // out‑of‑range samples (e.g. `p010`‑style packing with the - // 10 active bits in the high 10 of each u16) can never push - // an intermediate past i16 range. For valid input the AND is - // a no‑op (samples already in [0, 1023]). + // chroma each. Each load is AND‑masked to the low BITS bits so + // out‑of‑range samples (e.g. high‑bit‑packed data handed to + // the low‑packed kernel) can never push an intermediate past + // i16 range. For valid input the AND is a no‑op. 
let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); @@ -325,7 +331,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( // Scalar tail — remaining < 16 pixels (always even per 4:2:0). if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -338,24 +344,25 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } } -/// NEON YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB (native depth). +/// NEON high‑bit‑depth YUV 4:2:0 (`BITS` ∈ {10, 12, 14}) → packed +/// **native‑depth `u16`** RGB. /// /// Block size is 16 Y pixels / 8 chroma pairs per iteration. Shares -/// all pre‑write math with [`yuv420p10_to_rgb_row`]; the only +/// all pre‑write math with [`yuv_420p_n_to_rgb_row`]; the only /// difference is the final clamp + write: -/// - Y‑path scale is calibrated for `OUT_BITS = 10` rather than 8, -/// so `y_scaled` lives in `[0, 1023]` before the chroma add. -/// - The `y_scaled + chroma` sum is clamped to `[0, 1023]` with -/// `vmaxq_s16(vminq_s16(_, 1023), 0)` — a simple saturate‑narrow -/// doesn't suffice because the sum can overshoot 1023 (up to ~2046 -/// without saturating at i16 bounds). +/// - Y‑path scale is calibrated for `OUT_BITS = BITS` rather than 8, +/// so `y_scaled` lives in `[0, (1 << BITS) - 1]`. +/// - The `y_scaled + chroma` sum is clamped to `[0, (1 << BITS) - 1]` +/// with `vmaxq_s16(vminq_s16(_, max), 0)` — a simple saturate‑ +/// narrow doesn't suffice because the sum can overshoot the +/// `BITS`-bit max without saturating at i16 bounds. /// - Writes use two `vst3q_u16` calls per iteration — each handles 8 /// pixels × 3 channels = 24 `u16` elements, so two cover 16 pixels. 
/// /// # Numerical contract /// -/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`] — every -/// Q15 multiply / shift / clamp mirrors the scalar reference. +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::`] across +/// supported `BITS` values. /// /// # Safety /// @@ -363,9 +370,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// 4. `BITS` must be one of `{10, 12, 14}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -381,10 +389,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: NEON availability is the caller's obligation; the // dispatcher in `crate::row` verifies it. 
Pointer adds are bounded @@ -396,8 +404,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); - let mask_v = vdupq_n_u16(scalar::bits_mask::<10>()); - let max_v = vdupq_n_s16(OUT_MAX_10); + let mask_v = vdupq_n_u16(scalar::bits_mask::()); + let max_v = vdupq_n_s16(out_max); let zero_v = vdupq_n_s16(0); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); @@ -408,9 +416,9 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - // AND‑mask each load to the low 10 bits so intermediates stay - // within the i16 range the Q15 narrow steps expect — see - // matching comment in [`yuv420p10_to_rgb_row`]. + // AND‑mask each load to the low BITS bits so intermediates + // stay within the i16 range the Q15 narrow steps expect — see + // matching comment in [`yuv_420p_n_to_rgb_row`]. let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); @@ -447,9 +455,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); // Native‑depth output: add Y + chroma in i16, then clamp to - // [0, 1023] explicitly. `vqaddq_s16` saturates at i16 bounds - // (irrelevant here since |sum| < 2047 always), so the subsequent - // max/min clamps to the 10‑bit range. + // [0, (1 << BITS) - 1] explicitly. `vqaddq_s16` saturates at + // i16 bounds (irrelevant here: |sum| stays well inside i16 + // for BITS ≤ 14), so the subsequent max/min clamps to the + // native bit depth. 
let r_lo = clamp_u10(vqaddq_s16(y_scaled_lo, r_dup_lo), zero_v, max_v); let r_hi = clamp_u10(vqaddq_s16(y_scaled_hi, r_dup_hi), zero_v, max_v); let g_lo = clamp_u10(vqaddq_s16(y_scaled_lo, g_dup_lo), zero_v, max_v); @@ -467,7 +476,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -481,33 +490,35 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } /// Clamps an i16x8 vector to `[0, max]` and reinterprets to u16x8. -/// Used by the 10‑bit u16 output path to avoid `vqmovun_s16`'s u8 -/// saturation. +/// Used by the native‑depth u16 output paths to avoid `vqmovun_s16`'s +/// u8 saturation. #[inline(always)] fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_t { unsafe { vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(v, zero_v), max_v)) } } -/// NEON P010 → packed **8‑bit** RGB. +/// NEON high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}: P010, P012) +/// → packed **8‑bit** RGB. /// /// Block size 16 Y pixels / 8 chroma pairs per iteration. Differences -/// from [`yuv420p10_to_rgb_row`]: +/// from [`yuv_420p_n_to_rgb_row`]: /// - UV is semi‑planar interleaved (`U0, V0, U1, V1, …`), split in /// one shot via `vld2q_u16` (returns separate U and V vectors). -/// - Each `u16` load is **shifted right by 6** (`vshrq_n_u16::<6>`) -/// instead of AND‑masked — P010 packs its 10 active bits in the -/// HIGH 10 of each `u16`, so `>> 6` extracts the value and -/// simultaneously clears the low 6 bits (which the format mandates -/// are zero anyway; the shift makes mispacked input deterministic). -/// - Chroma bias is 512 (10‑bit center) after the shift. +/// - Each `u16` load is **right‑shifted by `16 - BITS`** — 6 for +/// P010, 4 for P012 — extracting the `BITS` active bits from the +/// high bits of each `u16` and clearing the low bits. 
The shift +/// runs via `vshlq_u16` with a negative loop‑invariant count so a +/// single kernel serves all supported bit depths. /// /// After the shift, the rest of the pipeline is identical to the -/// `yuv420p10` path — same `chroma_i16x8` / `scale_y` / `chroma_dup` -/// / `vst3q_u8` write, with `range_params_n::<10, 8>` scaling. +/// low‑bit‑packed planar path — same `chroma_i16x8` / `scale_y` / +/// `chroma_dup` / `vst3q_u8` write, with `range_params_n::` +/// scaling. /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] across all +/// supported `BITS` values. /// /// # Safety /// @@ -515,9 +526,10 @@ fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_t { /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// 4. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -531,8 +543,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: NEON availability is the caller's obligation. @@ -542,6 +554,9 @@ pub(crate) unsafe fn p010_to_rgb_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); + // `vshlq_u16` performs right shift when the count is negative. + // Count = -(16 - BITS) extracts the `BITS` active high bits. 
+ let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); let cgu = vdupq_n_s32(coeffs.g_u()); @@ -551,17 +566,16 @@ pub(crate) unsafe fn p010_to_rgb_row( let mut x = 0usize; while x + 16 <= width { - // 16 Y pixels in two u16x8 loads, shifted right by 6 to extract - // the 10‑bit values from P010's high‑bit packing. - let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); - let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + // 16 Y pixels in two u16x8 loads, right-shifted by 16-BITS to + // extract the active bits from the high-bit packing. + let y_vec_lo = vshlq_u16(vld1q_u16(y.as_ptr().add(x)), shr_count); + let y_vec_hi = vshlq_u16(vld1q_u16(y.as_ptr().add(x + 8)), shr_count); // Semi‑planar UV: `vld2q_u16` loads 16 interleaved `u16` elements - // and returns (evens, odds) = (U, V) in one shot. Each gets the - // same `>> 6` shift as Y. + // and returns (evens, odds) = (U, V) in one shot. let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); - let u_vec = vshrq_n_u16::<6>(uv_pair.0); - let v_vec = vshrq_n_u16::<6>(uv_pair.1); + let u_vec = vshlq_u16(uv_pair.0, shr_count); + let v_vec = vshlq_u16(uv_pair.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -613,7 +627,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -625,8 +639,9 @@ pub(crate) unsafe fn p010_to_rgb_row( } } -/// NEON P010 → packed **10‑bit `u16`** RGB (native‑depth, low‑bit‑ -/// packed output — `yuv420p10le` convention, not P010). +/// NEON high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, +/// `yuv420p10le` / `yuv420p12le` convention — not P010/P012). 
/// /// Same structure as [`p010_to_rgb_row`] up to the chroma compute; /// the only differences are: @@ -638,7 +653,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -648,7 +663,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -662,10 +677,10 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: NEON availability is the caller's obligation. 
unsafe { @@ -674,7 +689,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); - let max_v = vdupq_n_s16(OUT_MAX_10); + let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); + let max_v = vdupq_n_s16(out_max); let zero_v = vdupq_n_s16(0); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); @@ -685,11 +701,11 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); - let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + let y_vec_lo = vshlq_u16(vld1q_u16(y.as_ptr().add(x)), shr_count); + let y_vec_hi = vshlq_u16(vld1q_u16(y.as_ptr().add(x + 8)), shr_count); let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); - let u_vec = vshrq_n_u16::<6>(uv_pair.0); - let v_vec = vshrq_n_u16::<6>(uv_pair.1); + let u_vec = vshlq_u16(uv_pair.0, shr_count); + let v_vec = vshlq_u16(uv_pair.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -737,7 +753,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1705,7 +1721,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { @@ -1730,7 +1746,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { 
@@ -1851,7 +1867,7 @@ mod tests { full_range, ); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -1870,7 +1886,7 @@ mod tests { full_range, ); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); } assert_eq!( rgb16_scalar, rgb16_neon, @@ -1913,9 +1929,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { let diff = rgb_scalar @@ -1938,9 +1954,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { let diff = rgb_scalar @@ -2036,9 +2052,9 @@ mod tests { for full_range in [true, false] { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_neon, 
width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -2047,9 +2063,9 @@ mod tests { let mut rgb16_scalar = std::vec![0u16; width * 3]; let mut rgb16_neon = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb16_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb16_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb16_neon, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb16_neon, width, matrix, full_range); } assert_eq!( rgb16_scalar, rgb16_neon, diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 21efa7c..35f068e 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -214,7 +214,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -230,8 +230,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: simd128 compile‑time availability is the caller's @@ -242,7 +242,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = i32x4_splat(y_scale); let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); - let mask_v = u16x8_splat(scalar::bits_mask::<10>()); + let mask_v = u16x8_splat(scalar::bits_mask::()); let cru = i32x4_splat(coeffs.r_u()); let crv = i32x4_splat(coeffs.r_v()); let cgu = i32x4_splat(coeffs.g_u()); @@ -303,7 +303,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if 
x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -334,7 +334,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -350,8 +350,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -363,7 +363,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = i32x4_splat(y_scale); let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); - let mask_v = u16x8_splat(scalar::bits_mask::<10>()); + let mask_v = u16x8_splat(scalar::bits_mask::()); let max_v = i16x8_splat(OUT_MAX_10); let zero_v = i16x8_splat(0); let cru = i32x4_splat(coeffs.r_u()); @@ -424,7 +424,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -511,7 +511,7 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. /// /// # Safety /// @@ -521,7 +521,7 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -535,8 +535,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: simd128 compile‑time availability is the caller's @@ -606,7 +606,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -623,7 +623,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -633,7 +633,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -647,8 +647,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -719,7 +719,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1609,7 +1609,7 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1634,7 +1634,15 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1724,7 +1732,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); 
unsafe { p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } @@ -1738,7 +1746,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index e5b6db7..a0f0e8d 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -234,7 +234,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -250,8 +250,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX2 availability is the caller's obligation. 
@@ -261,7 +261,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); - let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::() as i16); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); let cgu = _mm256_set1_epi32(coeffs.g_u()); @@ -338,7 +338,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -375,7 +375,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -391,8 +391,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -403,7 +403,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); - let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::() as i16); let max_v = _mm256_set1_epi16(OUT_MAX_10); let zero_v = _mm256_set1_epi16(0); let cru = _mm256_set1_epi32(coeffs.r_u()); @@ -505,7 +505,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - 
scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -538,7 +538,7 @@ fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. /// /// # Safety /// @@ -548,7 +548,7 @@ fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -562,8 +562,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX2 availability is the caller's obligation. @@ -644,7 +644,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -661,7 +661,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -671,7 +671,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -685,8 +685,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -787,7 +787,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1608,7 +1608,7 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1636,7 +1636,15 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1729,7 +1737,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); 
unsafe { p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } @@ -1746,7 +1754,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 19caeaa..8e4ece2 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -251,7 +251,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -267,8 +267,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX‑512BW availability is the caller's obligation. 
@@ -278,7 +278,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); - let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm512_set1_epi16(scalar::bits_mask::() as i16); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); let cgu = _mm512_set1_epi32(coeffs.g_u()); @@ -358,7 +358,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -391,7 +391,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -407,8 +407,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -419,7 +419,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); - let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm512_set1_epi16(scalar::bits_mask::() as i16); let max_v = _mm512_set1_epi16(OUT_MAX_10); let zero_v = _mm512_set1_epi16(0); let cru = _mm512_set1_epi32(coeffs.r_u()); @@ -508,7 +508,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - 
scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -585,7 +585,7 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. /// /// # Safety /// @@ -595,7 +595,7 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -609,8 +609,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX‑512BW availability is the caller's obligation. @@ -692,7 +692,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -709,7 +709,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -719,7 +719,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -733,8 +733,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -823,7 +823,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1645,7 +1645,7 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1673,7 +1673,15 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1766,7 +1774,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, 
full_range); unsafe { p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } @@ -1783,7 +1791,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 66e385b..b91d98e 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -37,10 +37,10 @@ use core::arch::x86_64::{ __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_and_si128, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, - _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, _mm_mullo_epi32, _mm_packs_epi32, - _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, - _mm_srai_epi32, _mm_srli_epi16, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, - _mm_unpackhi_epi64, _mm_unpacklo_epi16, _mm_unpacklo_epi64, + _mm_cvtsi32_si128, _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, + _mm_mullo_epi32, _mm_packs_epi32, _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, + _mm_setr_epi8, _mm_shuffle_epi8, _mm_srai_epi32, _mm_srl_epi16, _mm_srli_si128, _mm_sub_epi16, + _mm_unpackhi_epi16, _mm_unpackhi_epi64, _mm_unpacklo_epi16, _mm_unpacklo_epi64, }; use crate::{ @@ -206,7 +206,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. /// /// # Safety /// @@ -216,7 +216,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -230,8 +230,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: SSE4.1 availability is the caller's obligation. @@ -304,7 +304,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -321,7 +321,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -331,7 +331,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -345,8 +345,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -415,7 +415,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -486,7 +486,7 @@ unsafe fn deinterleave_uv_u16(ptr: *const u16) -> (__m128i, __m128i) { /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -502,8 +502,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: SSE4.1 availability is the caller's obligation; the @@ -516,7 +516,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::() as i16); let cru = 
_mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -579,7 +579,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -614,7 +614,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -630,8 +630,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -642,7 +642,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::() as i16); let max_v = _mm_set1_epi16(OUT_MAX_10); let zero_v = _mm_set1_epi16(0); let cru = _mm_set1_epi32(coeffs.r_u()); @@ -708,7 +708,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -1387,7 +1387,7 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + 
scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1415,7 +1415,15 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1508,7 +1516,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } @@ -1525,7 +1533,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } diff --git a/src/row/mod.rs b/src/row/mod.rs index 80afab7..f6257fc 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -350,7 +350,7 @@ pub fn yuv420p10_to_rgb_row( // SAFETY: NEON verified on this CPU; bounds / parity are // the caller's obligation (asserted above). 
unsafe { - arch::neon::yuv420p10_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -359,7 +359,7 @@ pub fn yuv420p10_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv420p10_to_rgb_row( + arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -368,7 +368,7 @@ pub fn yuv420p10_to_rgb_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv420p10_to_rgb_row( + arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -377,7 +377,7 @@ pub fn yuv420p10_to_rgb_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv420p10_to_rgb_row( + arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -388,7 +388,7 @@ pub fn yuv420p10_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv420p10_to_rgb_row( + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -444,7 +444,7 @@ pub fn yuv420p10_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv420p10_to_rgb_u16_row( + arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -455,7 +455,7 @@ pub fn yuv420p10_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv420p10_to_rgb_u16_row( + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -464,7 +464,7 @@ pub fn yuv420p10_to_rgb_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv420p10_to_rgb_u16_row( + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -473,7 +473,7 @@ pub fn yuv420p10_to_rgb_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv420p10_to_rgb_u16_row( + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -484,7 +484,7 @@ pub fn yuv420p10_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv420p10_to_rgb_u16_row( + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -504,7 +504,7 @@ pub fn yuv420p10_to_rgb_u16_row( /// /// This is the HDR hardware‑decode keystone format: VideoToolbox, /// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit -/// output. See `scalar::p010_to_rgb_row` for the full semantic +/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic /// specification. `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -529,7 +529,7 @@ pub fn p010_to_rgb_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -538,21 +538,21 @@ pub fn p010_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -561,7 +561,7 @@ pub fn p010_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -570,7 +570,7 @@ pub fn p010_to_rgb_row( } } - scalar::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P010** to **native‑depth `u16`** packed RGB @@ -579,7 +579,7 @@ pub fn p010_to_rgb_row( /// Callers feeding this output into a P010 consumer must shift left /// by 6. /// -/// See `scalar::p010_to_rgb_u16_row` for the full spec. +/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. /// `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -604,7 +604,7 @@ pub fn p010_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -613,21 +613,21 @@ pub fn p010_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. 
unsafe { - arch::x86_avx512::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -636,7 +636,7 @@ pub fn p010_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p010_to_rgb_u16_row( + arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( y, uv_half, rgb_out, width, matrix, full_range, ); } @@ -647,7 +647,7 @@ pub fn p010_to_rgb_u16_row( } } - scalar::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 527ea4d..26759c9 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -347,16 +347,17 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( // ---- P010 (semi-planar 10-bit, high-bit-packed) → RGB ------------------ /// Converts one row of P010 (semi‑planar 4:2:0 with UV interleaved, -/// 10 active bits in the **high** 10 of each `u16`) to **8‑bit** -/// packed RGB. +/// `BITS` active bits in the **high** `BITS` of each `u16`) to +/// **8‑bit** packed RGB. 
/// /// Structurally identical to [`nv12_to_rgb_row`] plus the per‑sample -/// shift: each `u16` load is extracted to its 10‑bit value via -/// `sample >> 6`, then the same Q15 pipeline as -/// [`yuv_420p_n_to_rgb_row`] runs with `BITS == 10`. Mispacked input -/// — e.g. a `yuv420p10le` buffer with values in the **low** 10 bits -/// — is masked down to a small positive number (producing near‑black -/// output) rather than silent garbage, matching every SIMD backend. +/// shift: each `u16` load is extracted to its `BITS`‑bit value via +/// `sample >> (16 - BITS)`, then the same Q15 pipeline as +/// [`yuv_420p_n_to_rgb_row`] runs with the same `BITS`. For `BITS == +/// 10` this is P010 (`>> 6`); for `BITS == 12` it's P012 (`>> 4`). +/// Mispacked input — e.g. a low‑bit‑packed buffer handed to this +/// kernel — has its active low bits discarded (producing near‑black +/// output), matching every SIMD backend. /// /// # Panics (debug builds) /// @@ -364,7 +365,7 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p010_to_rgb_row( +pub(crate) fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -372,26 +373,28 @@ pub(crate) fn p010_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - debug_assert_eq!(width & 1, 0, "P010 requires even width"); + debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); let coeffs = Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = range_params_n::<10, 8>(full_range); - let bias = chroma_bias::<10>(); - - // Each `u16` load is converted to its 10-bit sample with `>> 6`, - // extracting the upper 10 bits and leaving the result in - // `[0, 1023]`. 
If low-packed input (`yuv420p10le`) is handed to - // this kernel by mistake, that shift discards the active low 6 bits - // rather than recovering the intended 10-bit value. No hot-path - // cost: one shift per load. + let (y_off, y_scale, c_scale) = range_params_n::(full_range); + let bias = chroma_bias::(); + let shift = 16 - BITS; + + // Each `u16` load is converted to its `BITS`-bit sample with + // `>> (16 - BITS)` — 6 for P010, 4 for P012. Extracts the upper + // bits and leaves the result in `[0, (1 << BITS) - 1]`. If + // low-packed input (`yuv420p10le`, `yuv420p12le`) is handed to + // this kernel by mistake, the shift discards the active low bits + // rather than recovering the intended value. No hot-path cost: + // one shift per load. let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2] >> 6; - let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_sample = uv_half[c_idx * 2] >> shift; + let v_sample = uv_half[c_idx * 2 + 1] >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -399,12 +402,12 @@ pub(crate) fn p010_to_rgb_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); rgb_out[x * 3] = clamp_u8(y0 + r_chroma); rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); - let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); @@ -413,15 +416,18 @@ pub(crate) fn p010_to_rgb_row( } } -/// Converts one row of P010 to **native‑depth `u16`** packed RGB -/// (10 active bits in the low bits of each `u16`, 
matching -/// `yuv420p10le` convention — **not** P010's high‑bit packing). +/// Converts one row of high‑bit‑packed semi‑planar 4:2:0 +/// (`BITS` ∈ {10, 12}: P010, P012) to **native‑depth `u16`** +/// packed RGB — samples are **low‑bit‑packed** on output +/// (`[0, (1 << BITS) - 1]` in the low bits of each `u16`, upper bits +/// zero), matching the `yuv420p10le` / `yuv420p12le` convention — +/// **not** the P010/P012 high‑bit packing. Callers feeding a P010/ +/// P012 consumer must shift the output left by `16 - BITS`. /// -/// Mirrors [`yuv_420p_n_to_rgb_u16_row::<10>`] on the math side; the -/// only difference is the input shift (`sample >> 6` instead of -/// `sample & 0x3FF`) and the UV deinterleave. Output is suitable for -/// direct consumption by downstream `yuv420p10le`‑shaped tooling. If -/// you need P010‑packed RGB output, shift left by 6 on the caller. +/// Mirrors [`yuv_420p_n_to_rgb_u16_row`] on the math side; the only +/// differences are the input shift (`sample >> (16 - BITS)` to +/// extract the `BITS`-bit value from the high-bit packing) and the +/// interleaved UV layout. /// /// # Panics (debug builds) /// @@ -429,7 +435,7 @@ pub(crate) fn p010_to_rgb_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p010_to_rgb_u16_row( +pub(crate) fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -437,21 +443,22 @@ pub(crate) fn p010_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - debug_assert_eq!(width & 1, 0, "P010 requires even width"); + debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); let coeffs = Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = range_params_n::<10, 10>(full_range); - let bias = chroma_bias::<10>(); - let out_max: i32 = (1i32 << 10) - 1; + let (y_off, y_scale, c_scale) = range_params_n::(full_range); + let bias = chroma_bias::(); + let out_max: i32 = (1i32 << BITS) - 1; + let shift = 16 - BITS; let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2] >> 6; - let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_sample = uv_half[c_idx * 2] >> shift; + let v_sample = uv_half[c_idx * 2 + 1] >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -459,12 +466,12 @@ pub(crate) fn p010_to_rgb_u16_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; - let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; rgb_out[(x + 1) * 3 + 1] = (y1 
+ g_chroma).clamp(0, out_max) as u16; rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; @@ -1131,7 +1138,7 @@ mod tests { let y = [0u16; 4]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; // U0 V0 U1 V1 let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); } @@ -1141,7 +1148,7 @@ mod tests { let y = [0xFFC0u16; 4]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); } @@ -1151,7 +1158,7 @@ mod tests { let y = [0x8000u16; 4]; let uv = [0x8000u16; 4]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); assert_eq!(r, g); @@ -1167,7 +1174,7 @@ mod tests { let y = [0x1000u16, 0x1000, 0xEB00, 0xEB00]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); @@ -1196,7 +1203,7 @@ mod tests { ColorMatrix::Bt709, true, ); - p010_to_rgb_row( + p_n_to_rgb_row::<10>( &y_p010, &uv_p010, &mut rgb_p010, @@ -1214,7 +1221,7 @@ mod tests { let y = [0xFFC0u16; 4]; let uv = [0x8000u16; 4]; let mut rgb = [0u16; 12]; - p010_to_rgb_u16_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 1023), "got 
{rgb:?}"); } @@ -1223,7 +1230,7 @@ mod tests { let y = [0x1000u16, 0xEB00]; let uv = [0x8000u16, 0x8000]; let mut rgb = [0u16; 6]; - p010_to_rgb_u16_row(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); } From 24c900f03381cffdfa1ea9ae90098974dafbb968 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:45:29 +1200 Subject: [PATCH 2/4] more simd backend --- src/row/arch/wasm_simd128.rs | 54 ++++++++++++++-------------- src/row/arch/x86_avx2.rs | 69 ++++++++++++++++++------------------ src/row/arch/x86_avx512.rs | 62 ++++++++++++++++---------------- src/row/arch/x86_sse41.rs | 57 +++++++++++++++-------------- 4 files changed, 118 insertions(+), 124 deletions(-) diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 35f068e..9a887e1 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -353,7 +353,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: simd128 compile‑time availability is the caller's // obligation. @@ -364,7 +364,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); let mask_v = u16x8_splat(scalar::bits_mask::()); - let max_v = i16x8_splat(OUT_MAX_10); + let max_v = i16x8_splat(out_max); let zero_v = i16x8_splat(0); let cru = i32x4_splat(coeffs.r_u()); let crv = i32x4_splat(coeffs.r_v()); @@ -554,13 +554,16 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr = (16 - BITS) as u32; + let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6); - let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6); + let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), shr); + let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), shr); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x)); - let u_vec = u16x8_shr(u_vec, 6); - let v_vec = u16x8_shr(v_vec, 6); + let u_vec = u16x8_shr(u_vec, shr); + let v_vec = u16x8_shr(v_vec, shr); let u_i16 = i16x8_sub(u_vec, bias_v); let v_i16 = i16x8_sub(v_vec, bias_v); @@ -650,7 +653,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: simd128 compile‑time availability is the caller's // obligation. @@ -660,7 +663,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let y_scale_v = i32x4_splat(y_scale); let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); - let max_v = i16x8_splat(OUT_MAX_10); + let max_v = i16x8_splat(out_max); let zero_v = i16x8_splat(0); let cru = i32x4_splat(coeffs.r_u()); let crv = i32x4_splat(coeffs.r_v()); @@ -669,13 +672,16 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr = (16 - BITS) as u32; + let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6); - let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6); + let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), shr); + let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), shr); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x)); - let u_vec = u16x8_shr(u_vec, 6); - let v_vec = u16x8_shr(v_vec, 6); + let u_vec = u16x8_shr(u_vec, shr); + let v_vec = u16x8_shr(v_vec, shr); let u_i16 = i16x8_sub(u_vec, bias_v); let v_i16 = i16x8_sub(v_vec, bias_v); @@ -1609,9 +1615,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1634,17 +1640,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::( - &y, - &u, - &v, - &mut rgb_scalar, - width, - matrix, - full_range, - ); + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1732,9 +1730,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + 
scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u8 diverges"); } @@ -1746,9 +1744,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u16 diverges"); } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index a0f0e8d..c4e5a12 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -39,12 +39,13 @@ //! element order. Every fixup is called out inline. 
use core::arch::x86_64::{ - __m256i, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, _mm256_and_si256, - _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256, - _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, _mm256_mullo_epi32, _mm256_packs_epi32, - _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set1_epi16, - _mm256_set1_epi32, _mm256_setr_epi8, _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_srli_epi16, - _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpacklo_epi16, + __m256i, _mm_cvtsi32_si128, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, + _mm256_and_si256, _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, + _mm256_extracti128_si256, _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, + _mm256_mullo_epi32, _mm256_packs_epi32, _mm256_packus_epi16, _mm256_permute2x128_si256, + _mm256_permute4x64_epi64, _mm256_set1_epi16, _mm256_set1_epi32, _mm256_setr_epi8, + _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_srl_epi16, _mm256_sub_epi16, + _mm256_unpackhi_epi16, _mm256_unpacklo_epi16, }; use crate::{ @@ -394,7 +395,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX2 availability is the caller's obligation. 
unsafe { @@ -404,7 +405,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); let mask_v = _mm256_set1_epi16(scalar::bits_mask::() as i16); - let max_v = _mm256_set1_epi16(OUT_MAX_10); + let max_v = _mm256_set1_epi16(out_max); let zero_v = _mm256_set1_epi16(0); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); @@ -573,6 +574,8 @@ pub(crate) unsafe fn p_n_to_rgb_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); + // High-bit-packed samples: shift right by `16 - BITS`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); let cgu = _mm256_set1_epi32(coeffs.g_u()); @@ -582,14 +585,15 @@ pub(crate) unsafe fn p_n_to_rgb_row( let mut x = 0usize; while x + 32 <= width { - // 32 Y = two u16×16 loads, shifted right by 6. - let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast())); + // 32 Y = two u16×16 loads, shifted right by `16 - BITS`. + let y_low_i16 = _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), shr_count); // 32 UV (16 pairs) — deinterleave + shift. 
let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); - let u_vec = _mm256_srli_epi16::<6>(u_vec); - let v_vec = _mm256_srli_epi16::<6>(v_vec); + let u_vec = _mm256_srl_epi16(u_vec, shr_count); + let v_vec = _mm256_srl_epi16(v_vec, shr_count); let u_i16 = _mm256_sub_epi16(u_vec, bias_v); let v_i16 = _mm256_sub_epi16(v_vec, bias_v); @@ -688,7 +692,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX2 availability is the caller's obligation. unsafe { @@ -697,8 +701,10 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); - let max_v = _mm256_set1_epi16(OUT_MAX_10); + let max_v = _mm256_set1_epi16(out_max); let zero_v = _mm256_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); let cgu = _mm256_set1_epi32(coeffs.g_u()); @@ -708,11 +714,12 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { - let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast())); + let y_low_i16 = _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); - let u_vec = _mm256_srli_epi16::<6>(u_vec); - let v_vec = _mm256_srli_epi16::<6>(v_vec); + let u_vec = _mm256_srl_epi16(u_vec, shr_count); + let v_vec = _mm256_srl_epi16(v_vec, shr_count); let u_i16 = _mm256_sub_epi16(u_vec, bias_v); let v_i16 = _mm256_sub_epi16(v_vec, bias_v); @@ -1608,9 +1615,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1636,17 +1643,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::( - &y, - &u, - &v, - &mut rgb_scalar, - width, - matrix, - full_range, - ); + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + 
yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1737,9 +1736,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u8 diverges"); } @@ -1754,9 +1753,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u16 diverges"); } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 8e4ece2..6b0dbe9 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -53,13 +53,13 @@ //! two 32‑Y‑block‑aligned vectors from unpacklo + unpackhi. 
use core::arch::x86_64::{ - __m128i, __m512i, _mm_setr_epi8, _mm256_loadu_si256, _mm512_add_epi32, _mm512_adds_epi16, - _mm512_and_si512, _mm512_broadcast_i32x4, _mm512_castsi512_si128, _mm512_castsi512_si256, - _mm512_cvtepi16_epi32, _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, + __m128i, __m512i, _mm_cvtsi32_si128, _mm_setr_epi8, _mm256_loadu_si256, _mm512_add_epi32, + _mm512_adds_epi16, _mm512_and_si512, _mm512_broadcast_i32x4, _mm512_castsi512_si128, + _mm512_castsi512_si256, _mm512_cvtepi16_epi32, _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, _mm512_extracti64x4_epi64, _mm512_loadu_si512, _mm512_max_epi16, _mm512_min_epi16, _mm512_mullo_epi32, _mm512_packs_epi32, _mm512_packus_epi16, _mm512_permutex2var_epi64, _mm512_permutexvar_epi64, _mm512_set1_epi16, _mm512_set1_epi32, _mm512_setr_epi64, - _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_srli_epi16, _mm512_sub_epi16, + _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_srl_epi16, _mm512_sub_epi16, _mm512_unpackhi_epi16, _mm512_unpacklo_epi16, }; @@ -410,7 +410,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX‑512BW availability is the caller's obligation. 
unsafe { @@ -420,7 +420,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); let mask_v = _mm512_set1_epi16(scalar::bits_mask::() as i16); - let max_v = _mm512_set1_epi16(OUT_MAX_10); + let max_v = _mm512_set1_epi16(out_max); let zero_v = _mm512_set1_epi16(0); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); @@ -620,6 +620,8 @@ pub(crate) unsafe fn p_n_to_rgb_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); + // High-bit-packed samples: shift right by `16 - BITS`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); let cgu = _mm512_set1_epi32(coeffs.g_u()); @@ -633,11 +635,12 @@ pub(crate) unsafe fn p_n_to_rgb_row( let mut x = 0usize; while x + 64 <= width { - let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let y_low_i16 = _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); - let u_vec = _mm512_srli_epi16::<6>(u_vec); - let v_vec = _mm512_srli_epi16::<6>(v_vec); + let u_vec = _mm512_srl_epi16(u_vec, shr_count); + let v_vec = _mm512_srl_epi16(v_vec, shr_count); let u_i16 = _mm512_sub_epi16(u_vec, bias_v); let v_i16 = _mm512_sub_epi16(v_vec, bias_v); @@ -736,7 +739,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX‑512BW 
availability is the caller's obligation. unsafe { @@ -745,8 +748,10 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); - let max_v = _mm512_set1_epi16(OUT_MAX_10); + let max_v = _mm512_set1_epi16(out_max); let zero_v = _mm512_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); let cgu = _mm512_set1_epi32(coeffs.g_u()); @@ -760,11 +765,12 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let mut x = 0usize; while x + 64 <= width { - let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let y_low_i16 = _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); - let u_vec = _mm512_srli_epi16::<6>(u_vec); - let v_vec = _mm512_srli_epi16::<6>(v_vec); + let u_vec = _mm512_srl_epi16(u_vec, shr_count); + let v_vec = _mm512_srl_epi16(v_vec, shr_count); let u_i16 = _mm512_sub_epi16(u_vec, bias_v); let v_i16 = _mm512_sub_epi16(v_vec, bias_v); @@ -1645,9 +1651,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1673,17 +1679,9 @@ mod tests { 
let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::( - &y, - &u, - &v, - &mut rgb_scalar, - width, - matrix, - full_range, - ); + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1774,9 +1772,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u8 diverges"); } @@ -1791,9 +1789,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u16 diverges"); } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index b91d98e..b81a060 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -241,6 +241,10 @@ pub(crate) unsafe fn p_n_to_rgb_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); + // 
High-bit-packed samples: shift right by `16 - BITS` to extract + // the BITS-bit value. Loop-invariant, loaded once into the low 64b + // of `shr_count` for `_mm_srl_epi16`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -250,15 +254,15 @@ pub(crate) unsafe fn p_n_to_rgb_row( let mut x = 0usize; while x + 16 <= width { - // Y: two u16×8 loads, each shifted right by 6. - let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + // Y: two u16×8 loads, each shifted right by `16 - BITS`. + let y_low_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), shr_count); // UV: two u16×8 loads of interleaved [U0,V0,U1,V1,...], then // deinterleave into separate u_vec + v_vec. let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); - let u_vec = _mm_srli_epi16::<6>(u_vec); - let v_vec = _mm_srli_epi16::<6>(v_vec); + let u_vec = _mm_srl_epi16(u_vec, shr_count); + let v_vec = _mm_srl_epi16(v_vec, shr_count); let u_i16 = _mm_sub_epi16(u_vec, bias_v); let v_i16 = _mm_sub_epi16(v_vec, bias_v); @@ -348,7 +352,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: SSE4.1 availability is the caller's obligation. 
unsafe { @@ -357,8 +361,11 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let max_v = _mm_set1_epi16(OUT_MAX_10); + let max_v = _mm_set1_epi16(out_max); let zero_v = _mm_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS` to extract + // the BITS-bit value. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -368,11 +375,11 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + let y_low_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); - let u_vec = _mm_srli_epi16::<6>(u_vec); - let v_vec = _mm_srli_epi16::<6>(v_vec); + let u_vec = _mm_srl_epi16(u_vec, shr_count); + let v_vec = _mm_srl_epi16(v_vec, shr_count); let u_i16 = _mm_sub_epi16(u_vec, bias_v); let v_i16 = _mm_sub_epi16(v_vec, bias_v); @@ -633,7 +640,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: SSE4.1 availability is the caller's obligation. 
unsafe { @@ -643,7 +650,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); let mask_v = _mm_set1_epi16(scalar::bits_mask::() as i16); - let max_v = _mm_set1_epi16(OUT_MAX_10); + let max_v = _mm_set1_epi16(out_max); let zero_v = _mm_set1_epi16(0); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); @@ -1387,9 +1394,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1415,17 +1422,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::( - &y, - &u, - &v, - &mut rgb_scalar, - width, - matrix, - full_range, - ); + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1516,9 +1515,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, 
rgb_simd, "SSE4.1 P010→u8 diverges"); } @@ -1533,9 +1532,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u16 diverges"); } From d917d8effb3eb5a3877231eda9c76fc8a59d2a7a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 19 Apr 2026 23:15:09 +1200 Subject: [PATCH 3/4] more simd backend --- Cargo.toml | 12 + benches/p012_to_rgb.rs | 94 ++++++ benches/yuv_420p12_to_rgb.rs | 100 ++++++ benches/yuv_420p14_to_rgb.rs | 100 ++++++ src/frame.rs | 152 ++++++--- src/row/arch/neon.rs | 152 +++++++++ src/row/arch/wasm_simd128.rs | 155 ++++++++++ src/row/arch/x86_avx2.rs | 164 ++++++++++ src/row/arch/x86_avx512.rs | 167 ++++++++++ src/row/arch/x86_sse41.rs | 176 +++++++++++ src/row/mod.rs | 445 +++++++++++++++++++++++++++ src/sinker/mixed.rs | 579 ++++++++++++++++++++++++++++++++++- src/yuv/mod.rs | 13 + src/yuv/p012.rs | 152 +++++++++ src/yuv/yuv420p12.rs | 161 ++++++++++ src/yuv/yuv420p14.rs | 159 ++++++++++ 16 files changed, 2730 insertions(+), 51 deletions(-) create mode 100644 benches/p012_to_rgb.rs create mode 100644 benches/yuv_420p12_to_rgb.rs create mode 100644 benches/yuv_420p14_to_rgb.rs create mode 100644 src/yuv/p012.rs create mode 100644 src/yuv/yuv420p12.rs create mode 100644 src/yuv/yuv420p14.rs diff --git a/Cargo.toml b/Cargo.toml index 458d138..4c98087 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,10 +32,22 @@ harness = false name = "yuv_420p10_to_rgb" harness = false +[[bench]] +name = "yuv_420p12_to_rgb" +harness = false + +[[bench]] +name = 
"yuv_420p14_to_rgb" +harness = false + [[bench]] name = "p010_to_rgb" harness = false +[[bench]] +name = "p012_to_rgb" +harness = false + [[bench]] name = "rgb_to_hsv" harness = false diff --git a/benches/p012_to_rgb.rs b/benches/p012_to_rgb.rs new file mode 100644 index 0000000..9443f6f --- /dev/null +++ b/benches/p012_to_rgb.rs @@ -0,0 +1,94 @@ +//! Per‑row P012 (semi‑planar 4:2:0, 12‑bit, high‑bit‑packed) → RGB +//! throughput baseline. +//! +//! Mirrors [`p010_to_rgb`] but feeds 12‑bit high‑bit‑packed samples +//! (12 active bits in the high 12 of each `u16`, low 4 bits zero). + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{p012_to_rgb_row, p012_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic P012‑packed pseudo‑random +/// sequence — 12‑bit values shifted into the high 12 bits of each +/// `u16` (low 4 bits zero), matching the real P012 storage layout. +fn fill_pseudo_random_p012(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = (((state >> 8) & 0xFFF) as u16) << 4; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("p012_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + // UV row payload is `width` u16 elements (w / 2 interleaved pairs). 
+ let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p012(&mut y, 0x1111); + fill_pseudo_random_p012(&mut uv, 0x2222); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p012_to_rgb_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("p012_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p012(&mut y, 0x1111); + fill_pseudo_random_p012(&mut uv, 0x2222); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p012_to_rgb_u16_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/benches/yuv_420p12_to_rgb.rs b/benches/yuv_420p12_to_rgb.rs new file mode 100644 index 0000000..cba3e28 --- /dev/null +++ b/benches/yuv_420p12_to_rgb.rs @@ -0,0 +1,100 @@ +//! Per‑row YUV 4:2:0 12‑bit → packed RGB throughput baseline. +//! +//! Mirrors [`yuv_420p10_to_rgb`] but feeds 12‑bit low‑bit‑packed +//! samples (values ≤ 4095). Same `u8_*` / `u16_*` split per width so +//! scalar vs SIMD speedup is a two‑line comparison in the Criterion +//! report. 
+ +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic 12‑bit pseudo‑random +/// sequence — values occupy the low 12 bits of each `u16`, matching +/// the storage layout of `yuv420p12le`. +fn fill_pseudo_random_u16(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = ((state >> 8) & 0xFFF) as u16; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("yuv420p12_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p12_to_rgb_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("yuv420p12_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); 
+ + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p12_to_rgb_u16_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/benches/yuv_420p14_to_rgb.rs b/benches/yuv_420p14_to_rgb.rs new file mode 100644 index 0000000..ac6e5ee --- /dev/null +++ b/benches/yuv_420p14_to_rgb.rs @@ -0,0 +1,100 @@ +//! Per‑row YUV 4:2:0 14‑bit → packed RGB throughput baseline. +//! +//! Mirrors [`yuv_420p10_to_rgb`] but feeds 14‑bit low‑bit‑packed +//! samples (values ≤ 16383). Same `u8_*` / `u16_*` split per width so +//! scalar vs SIMD speedup is a two‑line comparison in the Criterion +//! report. + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic 14‑bit pseudo‑random +/// sequence — values occupy the low 14 bits of each `u16`, matching +/// the storage layout of `yuv420p14le`. 
+fn fill_pseudo_random_u16(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = ((state >> 8) & 0x3FFF) as u16; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("yuv420p14_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p14_to_rgb_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("yuv420p14_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p14_to_rgb_u16_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + 
+criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/src/frame.rs b/src/frame.rs index 8becc63..2c1997b 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -519,7 +519,7 @@ pub enum Nv12FrameError { /// — wrong colors, but consistently wrong across scalar + every /// SIMD backend, which is visible in any output diff. #[derive(Debug, Clone, Copy)] -pub struct P010Frame<'a> { +pub struct PnFrame<'a, const BITS: u32> { y: &'a [u16], uv: &'a [u16], width: u32, @@ -528,7 +528,7 @@ pub struct P010Frame<'a> { uv_stride: u32, } -impl<'a> P010Frame<'a> { +impl<'a, const BITS: u32> PnFrame<'a, BITS> { /// Constructs a new [`P010Frame`], validating dimensions and plane /// lengths. Strides are in `u16` **samples**. /// @@ -548,19 +548,26 @@ impl<'a> P010Frame<'a> { height: u32, y_stride: u32, uv_stride: u32, - ) -> Result { + ) -> Result { + // Guard the `BITS` parameter at the top — 10 and 12 are the only + // high-bit-packed depths supported by the Q15 kernel family. 14 + // exists in the planar `yuv420p14le` family but not as a Pn + // hardware output; 16 would need i64 intermediates. 
+ if BITS != 10 && BITS != 12 { + return Err(PnFrameError::UnsupportedBits { bits: BITS }); + } if width == 0 || height == 0 { - return Err(P010FrameError::ZeroDimension { width, height }); + return Err(PnFrameError::ZeroDimension { width, height }); } if width & 1 != 0 { - return Err(P010FrameError::OddWidth { width }); + return Err(PnFrameError::OddWidth { width }); } if y_stride < width { - return Err(P010FrameError::YStrideTooSmall { width, y_stride }); + return Err(PnFrameError::YStrideTooSmall { width, y_stride }); } let uv_row_elems = width; if uv_stride < uv_row_elems { - return Err(P010FrameError::UvStrideTooSmall { + return Err(PnFrameError::UvStrideTooSmall { uv_row_elems, uv_stride, }); @@ -569,14 +576,14 @@ impl<'a> P010Frame<'a> { let y_min = match (y_stride as usize).checked_mul(height as usize) { Some(v) => v, None => { - return Err(P010FrameError::GeometryOverflow { + return Err(PnFrameError::GeometryOverflow { stride: y_stride, rows: height, }); } }; if y.len() < y_min { - return Err(P010FrameError::YPlaneTooShort { + return Err(PnFrameError::YPlaneTooShort { expected: y_min, actual: y.len(), }); @@ -585,14 +592,14 @@ impl<'a> P010Frame<'a> { let uv_min = match (uv_stride as usize).checked_mul(chroma_height as usize) { Some(v) => v, None => { - return Err(P010FrameError::GeometryOverflow { + return Err(PnFrameError::GeometryOverflow { stride: uv_stride, rows: chroma_height, }); } }; if uv.len() < uv_min { - return Err(P010FrameError::UvPlaneTooShort { + return Err(PnFrameError::UvPlaneTooShort { expected: uv_min, actual: uv.len(), }); @@ -621,7 +628,7 @@ impl<'a> P010Frame<'a> { ) -> Self { match Self::try_new(y, uv, width, height, y_stride, uv_stride) { Ok(frame) => frame, - Err(_) => panic!("invalid P010Frame dimensions or plane lengths"), + Err(_) => panic!("invalid PnFrame dimensions, plane lengths, or BITS value"), } } @@ -646,7 +653,7 @@ impl<'a> P010Frame<'a> { /// Cost: one O(plane_size) scan per plane. 
The default /// [`Self::try_new`] skips this so the hot path stays O(1). /// - /// Returns [`P010FrameError::SampleLowBitsSet`] on the first + /// Returns [`PnFrameError::SampleLowBitsSet`] on the first /// offending sample — carries the plane, element index, and /// offending value. #[cfg_attr(not(tarpaulin), inline(always))] @@ -657,8 +664,10 @@ impl<'a> P010Frame<'a> { height: u32, y_stride: u32, uv_stride: u32, - ) -> Result { + ) -> Result { let frame = Self::try_new(y, uv, width, height, y_stride, uv_stride)?; + let low_bits = 16 - BITS; + let low_mask: u16 = ((1u32 << low_bits) - 1) as u16; let w = width as usize; let h = height as usize; let uv_w = w; // interleaved: `width / 2` pairs × 2 elements @@ -666,11 +675,12 @@ impl<'a> P010Frame<'a> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s & 0x3F != 0 { - return Err(P010FrameError::SampleLowBitsSet { - plane: P010FramePlane::Y, + if s & low_mask != 0 { + return Err(PnFrameError::SampleLowBitsSet { + plane: PnFramePlane::Y, index: start + col, value: s, + low_bits, }); } } @@ -678,11 +688,12 @@ impl<'a> P010Frame<'a> { for row in 0..chroma_h { let start = row * uv_stride as usize; for (col, &s) in uv[start..start + uv_w].iter().enumerate() { - if s & 0x3F != 0 { - return Err(P010FrameError::SampleLowBitsSet { - plane: P010FramePlane::Uv, + if s & low_mask != 0 { + return Err(PnFrameError::SampleLowBitsSet { + plane: PnFramePlane::Uv, index: start + col, value: s, + low_bits, }); } } @@ -730,23 +741,51 @@ impl<'a> P010Frame<'a> { pub const fn uv_stride(&self) -> u32 { self.uv_stride } + + /// Active bit depth — 10 or 12. Mirrors the `BITS` const parameter + /// so generic code can read it without naming the type. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn bits(&self) -> u32 { + BITS + } } -/// Identifies which plane of a [`P010Frame`] a -/// [`P010FrameError::SampleLowBitsSet`] refers to. 
+/// Type alias for a validated P010 frame (10‑bit, high‑bit‑packed). +/// Use this name at call sites for readability. +pub type P010Frame<'a> = PnFrame<'a, 10>; + +/// Type alias for a validated P012 frame (12‑bit, high‑bit‑packed). +/// Same layout as [`P010Frame`] but with 12 active bits in the high +/// 12 of each `u16` (`sample = value << 4`, low 4 bits zero). +pub type P012Frame<'a> = PnFrame<'a, 12>; + +/// Identifies which plane of a [`PnFrame`] a +/// [`PnFrameError::SampleLowBitsSet`] refers to. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)] -pub enum P010FramePlane { +pub enum PnFramePlane { /// Luma plane. Y, /// Interleaved UV plane. Uv, } -/// Errors returned by [`P010Frame::try_new`] and -/// [`P010Frame::try_new_checked`]. +/// Back‑compat alias for the pre‑generalization plane enum name. +pub type P010FramePlane = PnFramePlane; + +/// Errors returned by [`PnFrame::try_new`] and +/// [`PnFrame::try_new_checked`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] #[non_exhaustive] -pub enum P010FrameError { +pub enum PnFrameError { + /// `BITS` was not one of the supported high‑bit‑packed depths + /// (10, 12). 14 exists in the planar `yuv420p14le` family but not + /// as a Pn hardware output; 16 would need a different kernel + /// family. + #[error("unsupported BITS ({bits}) for PnFrame; must be 10 or 12")] + UnsupportedBits { + /// The unsupported value of the `BITS` const parameter. + bits: u32, + }, /// `width` or `height` was zero. #[error("width ({width}) or height ({height}) is zero")] ZeroDimension { @@ -803,29 +842,34 @@ pub enum P010FrameError { /// Row count that overflowed against the stride. rows: u32, }, - /// A sample's low 6 bits were non‑zero — P010 packs its 10 active - /// bits in the high 10 of each `u16`, so valid samples are always - /// multiples of 64 (`value << 6`). Only - /// [`P010Frame::try_new_checked`] can produce this error. 
+ /// A sample's low `16 - BITS` bits were non‑zero — a Pn sample + /// packs its `BITS` active bits in the high `BITS` of each `u16`, + /// so valid samples are always multiples of `1 << (16 - BITS)` + /// (64 for 10‑bit, 16 for 12‑bit). Only + /// [`PnFrame::try_new_checked`] can produce this error. /// /// Note: the absence of this error does **not** prove the buffer - /// is P010. A `yuv420p10le` buffer of samples that all happen to - /// be multiples of 64 (e.g. `Y = 64`, `UV = 512`) passes the - /// check silently. See [`P010Frame::try_new_checked`] for the - /// full discussion. + /// is Pn. A low‑bit‑packed buffer of samples that all happen to be + /// multiples of `1 << (16 - BITS)` passes the check silently. See + /// [`PnFrame::try_new_checked`] for the full discussion. #[error( - "sample {value:#06x} on plane {plane} at element {index} has non-zero low 6 bits (not a valid P010 sample)" + "sample {value:#06x} on plane {plane} at element {index} has non-zero low {low_bits} bits (not a valid Pn sample at the declared BITS)" )] SampleLowBitsSet { /// Which plane the offending sample lives on. - plane: P010FramePlane, + plane: PnFramePlane, /// Element index within that plane's slice. index: usize, /// The offending sample value. value: u16, + /// Number of low bits expected to be zero (`16 - BITS`). + low_bits: u32, }, } +/// Back‑compat alias for the pre‑generalization error enum name. +pub type P010FrameError = PnFrameError; + /// A validated NV21 (semi‑planar 4:2:0) frame. /// /// Structurally identical to [`Nv12Frame`] — one full-size luma plane @@ -1411,6 +1455,20 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { /// for readability. pub type Yuv420p10Frame<'a> = Yuv420pFrame16<'a, 10>; +/// Type alias for a validated YUV 4:2:0 planar frame at 12 bits per +/// sample (`AV_PIX_FMT_YUV420P12LE`). 
Tight wrapper over +/// [`Yuv420pFrame16`] with `BITS == 12` — same low‑bit‑packed `u16` +/// layout as [`Yuv420p10Frame`], just with 12 active bits in the +/// low 12 of each element (upper 4 bits zero). +pub type Yuv420p12Frame<'a> = Yuv420pFrame16<'a, 12>; + +/// Type alias for a validated YUV 4:2:0 planar frame at 14 bits per +/// sample (`AV_PIX_FMT_YUV420P14LE`). Tight wrapper over +/// [`Yuv420pFrame16`] with `BITS == 14` — same low‑bit‑packed `u16` +/// layout as [`Yuv420p10Frame`], just with 14 active bits in the +/// low 14 of each element (upper 2 bits zero). +pub type Yuv420p14Frame<'a> = Yuv420pFrame16<'a, 14>; + /// Errors returned by [`Yuv420pFrame16::try_new`]. Variant shape /// mirrors [`Yuv420pFrameError`], with `UnsupportedBits` added for /// the new `BITS` parameter and all sizes expressed in **samples** @@ -2163,28 +2221,28 @@ mod tests { fn p010_try_new_rejects_odd_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 15, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::OddWidth { width: 15 })); + assert!(matches!(e, PnFrameError::OddWidth { width: 15 })); } #[test] fn p010_try_new_rejects_zero_dim() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 0, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::ZeroDimension { .. })); + assert!(matches!(e, PnFrameError::ZeroDimension { .. })); } #[test] fn p010_try_new_rejects_y_stride_under_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 16, 8, 8, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YStrideTooSmall { .. })); + assert!(matches!(e, PnFrameError::YStrideTooSmall { .. })); } #[test] fn p010_try_new_rejects_uv_stride_under_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 8).unwrap_err(); - assert!(matches!(e, P010FrameError::UvStrideTooSmall { .. })); + assert!(matches!(e, PnFrameError::UvStrideTooSmall { .. 
})); } #[test] @@ -2192,7 +2250,7 @@ mod tests { let y = std::vec![0u16; 10]; let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YPlaneTooShort { .. })); + assert!(matches!(e, PnFrameError::YPlaneTooShort { .. })); } #[test] @@ -2200,11 +2258,11 @@ mod tests { let y = std::vec![0u16; 16 * 8]; let uv = std::vec![0x8000u16; 8]; let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::UvPlaneTooShort { .. })); + assert!(matches!(e, PnFrameError::UvPlaneTooShort { .. })); } #[test] - #[should_panic(expected = "invalid P010Frame")] + #[should_panic(expected = "invalid PnFrame")] fn p010_new_panics_on_invalid() { let y = std::vec![0u16; 10]; let uv = std::vec![0x8000u16; 16 * 4]; @@ -2218,7 +2276,7 @@ mod tests { let y: [u16; 0] = []; let uv: [u16; 0] = []; let e = P010Frame::try_new(&y, &uv, big, big, big, big).unwrap_err(); - assert!(matches!(e, P010FrameError::GeometryOverflow { .. })); + assert!(matches!(e, PnFrameError::GeometryOverflow { .. })); } #[test] @@ -2239,7 +2297,7 @@ mod tests { let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); match e { - P010FrameError::SampleLowBitsSet { plane, value, .. } => { + PnFrameError::SampleLowBitsSet { plane, value, .. } => { assert_eq!(plane, P010FramePlane::Y); assert_eq!(value, 0x03FF); } @@ -2255,7 +2313,7 @@ mod tests { let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); assert!(matches!( e, - P010FrameError::SampleLowBitsSet { + PnFrameError::SampleLowBitsSet { plane: P010FramePlane::Uv, value: 0x0001, .. @@ -2268,7 +2326,7 @@ mod tests { let y = std::vec![0u16; 10]; // Too small. let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YPlaneTooShort { .. 
}));
+        assert!(matches!(e, PnFrameError::YPlaneTooShort { .. }));
     }
 
     /// Regression documenting a **known limitation** of
diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index 54e7a07..f8ceba2 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -2075,4 +2075,156 @@ mod tests {
             }
         }
     }
+
+    // ---- Generic BITS equivalence (12/14-bit coverage) ------------------
+
+    fn planar_n_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16)
+            .collect()
+    }
+
+    fn p_n_packed_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        let shift = 16 - BITS;
+        (0..n)
+            .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift)
+            .collect()
+    }
+
+    fn check_planar_u8_neon_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_neon = std::vec![0u8; width * 3];
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_neon, "NEON planar {BITS}-bit → u8 diverges");
+    }
+
+    fn check_planar_u16_neon_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_neon = std::vec![0u16; width * 3];
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &mut rgb_scalar,
+            width,
+            matrix,
+            full_range,
+        );
+        unsafe {
+            yuv_420p_n_to_rgb_u16_row::<BITS>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_neon,
+            "NEON planar {BITS}-bit → u16 
diverges" + ); + } + + fn check_pn_u8_neon_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_neon = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_neon, "NEON Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_neon_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_neon = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_neon, "NEON Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn neon_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_neon_equivalence_n::<12>(16, m, full); + check_planar_u16_neon_equivalence_n::<12>(16, m, full); + check_pn_u8_neon_equivalence_n::<12>(16, m, full); + check_pn_u16_neon_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn neon_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + 
check_planar_u8_neon_equivalence_n::<14>(16, m, full); + check_planar_u16_neon_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn neon_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_neon_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_neon_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_neon_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_neon_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn neon_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_neon_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_neon_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 9a887e1..f0619a9 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1796,4 +1796,159 @@ mod tests { check_p010_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- Generic BITS equivalence (12/14-bit coverage) ------------------ + + fn planar_n_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + (0..n) + .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16) + .collect() + } + + fn p_n_packed_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let shift = 16 - BITS; + (0..n) + .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift) + .collect() + } + + fn check_planar_u8_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, 
width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "simd128 planar {BITS}-bit → u8 diverges" + ); + } + + fn check_planar_u16_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "simd128 planar {BITS}-bit → u16 diverges" + ); + } + + fn check_pn_u8_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "simd128 Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, 
width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "simd128 Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn simd128_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_simd128_equivalence_n::<12>(16, m, full); + check_planar_u16_simd128_equivalence_n::<12>(16, m, full); + check_pn_u8_simd128_equivalence_n::<12>(16, m, full); + check_pn_u16_simd128_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn simd128_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_simd128_equivalence_n::<14>(16, m, full); + check_planar_u16_simd128_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn simd128_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_simd128_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_simd128_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_simd128_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_simd128_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn simd128_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_simd128_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_simd128_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index c4e5a12..16deb67 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1805,4 +1805,168 @@ mod tests { check_p010_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- 
Generic BITS equivalence (12/14-bit coverage) ------------------
+
+    fn planar_n_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16)
+            .collect()
+    }
+
+    fn p_n_packed_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        let shift = 16 - BITS;
+        (0..n)
+            .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift)
+            .collect()
+    }
+
+    fn check_planar_u8_avx2_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "AVX2 planar {BITS}-bit → u8 diverges");
+    }
+
+    fn check_planar_u16_avx2_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &mut rgb_scalar,
+            width,
+            matrix,
+            full_range,
+        );
+        unsafe {
+            yuv_420p_n_to_rgb_u16_row::<BITS>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_simd,
+            "AVX2 planar {BITS}-bit → u16 diverges"
+        );
+    }
+
+    fn check_pn_u8_avx2_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+        scalar::p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "AVX2 Pn {BITS}-bit → u8 diverges");
+    }
+
+    fn check_pn_u16_avx2_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+        scalar::p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "AVX2 Pn {BITS}-bit → u16 diverges");
+    }
+
+    #[test]
+    fn avx2_p12_matches_scalar_all_matrices() {
+        for m in [
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::Smpte240m,
+            ColorMatrix::Fcc,
+            ColorMatrix::YCgCo,
+        ] {
+            for full in [true, false] {
+                check_planar_u8_avx2_equivalence_n::<12>(32, m, full);
+                check_planar_u16_avx2_equivalence_n::<12>(32, m, full);
+                check_pn_u8_avx2_equivalence_n::<12>(32, m, full);
+                check_pn_u16_avx2_equivalence_n::<12>(32, m, full);
+            }
+        }
+    }
+
+    #[test]
+    fn avx2_p14_matches_scalar_all_matrices() {
+        for m in [
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::Smpte240m,
+            ColorMatrix::Fcc,
+            ColorMatrix::YCgCo,
+        ] {
+            for full in [true, false] {
+                check_planar_u8_avx2_equivalence_n::<14>(32, m, full);
+
check_planar_u16_avx2_equivalence_n::<14>(32, m, full); + } + } + } + + #[test] + fn avx2_p12_matches_scalar_tail_widths() { + for w in [34usize, 62, 66, 1922] { + check_planar_u8_avx2_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx2_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_avx2_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx2_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn avx2_p14_matches_scalar_tail_widths() { + for w in [34usize, 62, 66, 1922] { + check_planar_u8_avx2_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx2_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 6b0dbe9..238a09a 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1841,4 +1841,171 @@ mod tests { check_p010_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- Generic BITS equivalence (12/14-bit coverage) ------------------ + + fn planar_n_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + (0..n) + .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16) + .collect() + } + + fn p_n_packed_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let shift = 16 - BITS; + (0..n) + .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift) + .collect() + } + + fn check_planar_u8_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, 
width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "AVX-512 planar {BITS}-bit → u8 diverges" + ); + } + + fn check_planar_u16_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "AVX-512 planar {BITS}-bit → u16 diverges" + ); + } + + fn check_pn_u8_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = 
std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn avx512_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx512_equivalence_n::<12>(64, m, full); + check_planar_u16_avx512_equivalence_n::<12>(64, m, full); + check_pn_u8_avx512_equivalence_n::<12>(64, m, full); + check_pn_u16_avx512_equivalence_n::<12>(64, m, full); + } + } + } + + #[test] + fn avx512_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx512_equivalence_n::<14>(64, m, full); + check_planar_u16_avx512_equivalence_n::<14>(64, m, full); + } + } + } + + #[test] + fn avx512_p12_matches_scalar_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_planar_u8_avx512_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx512_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_avx512_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx512_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn avx512_p14_matches_scalar_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_planar_u8_avx512_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx512_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index b81a060..1dd5f2d 100644 --- a/src/row/arch/x86_sse41.rs +++ 
b/src/row/arch/x86_sse41.rs
@@ -1584,4 +1584,180 @@ mod tests {
         check_p010_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false);
         check_p010_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
     }
+
+    // ---- Generic BITS equivalence (12/14-bit coverage) ------------------
+    //
+    // The helpers below parameterize over `const BITS: u32` so the same
+    // scalar-equivalence scaffolding covers 10/12/14 without duplicating
+    // the 16-pixel block seeding + diff harness. `<10>` is already
+    // exercised by the dedicated tests above; `<12>` / `<14>` add
+    // regression coverage for the new yuv420p12 / yuv420p14 / P012
+    // kernels. 14-bit is planar-only (no P014 in Ship 4a).
+
+    fn planar_n_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16)
+            .collect()
+    }
+
+    fn p_n_packed_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        let shift = 16 - BITS;
+        (0..n)
+            .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift)
+            .collect()
+    }
+
+    fn check_planar_u8_sse41_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_simd,
+            "SSE4.1 planar {BITS}-bit → u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+        );
+    }
+
+    fn check_planar_u16_sse41_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &mut rgb_scalar,
+            width,
+            matrix,
+            full_range,
+        );
+        unsafe {
+            yuv_420p_n_to_rgb_u16_row::<BITS>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_simd,
+            "SSE4.1 planar {BITS}-bit → u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+        );
+    }
+
+    fn check_pn_u8_sse41_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+        scalar::p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 Pn {BITS}-bit → u8 diverges");
+    }
+
+    fn check_pn_u16_sse41_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+        scalar::p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 Pn {BITS}-bit → u16 diverges");
+    }
+
+    #[test]
+    fn
sse41_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_sse41_equivalence_n::<12>(16, m, full); + check_planar_u16_sse41_equivalence_n::<12>(16, m, full); + check_pn_u8_sse41_equivalence_n::<12>(16, m, full); + check_pn_u16_sse41_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn sse41_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_sse41_equivalence_n::<14>(16, m, full); + check_planar_u16_sse41_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn sse41_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_sse41_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_sse41_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_sse41_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_sse41_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn sse41_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_sse41_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_sse41_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/mod.rs b/src/row/mod.rs index f6257fc..1201ac6 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -650,6 +650,451 @@ pub fn p010_to_rgb_u16_row( scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } +/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 12 active bits in the low 12 bits of each +/// element (low‑bit‑packed `yuv420p12le` convention). 
Output is packed +/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The +/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching +/// `yuv420p12le` convention — upper 4 bits zero). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (14‑bit values in the low 14 of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ +/// packed — 12 active bits in the high 12 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and +/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but +/// `>> 4` instead of `>> 6` at each `u16` load. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** to **native‑depth `u16`** packed RGB +/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed +/// `yuv420p12le` convention, **not** P012's high‑bit packing). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. 
/// diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 92835a5..5877255 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -19,12 +19,15 @@ use thiserror::Error; use crate::{ HsvBuffers, PixelSink, SourceFormat, row::{ - nv12_to_rgb_row, nv21_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, rgb_to_hsv_row, - yuv_420_to_rgb_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, + nv12_to_rgb_row, nv21_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, + p012_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv420p10_to_rgb_row, + yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, + yuv420p14_to_rgb_u16_row, }, yuv::{ - Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, P010, P010Row, P010Sink, Yuv420p, Yuv420p10, - Yuv420p10Row, Yuv420p10Sink, Yuv420pRow, Yuv420pSink, + Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, P010, P010Row, P010Sink, P012, P012Row, + P012Sink, Yuv420p, Yuv420p10, Yuv420p10Row, Yuv420p10Sink, Yuv420p12, Yuv420p12Row, + Yuv420p12Sink, Yuv420p14, Yuv420p14Row, Yuv420p14Sink, Yuv420pRow, Yuv420pSink, }, }; @@ -225,6 +228,35 @@ pub enum RowSlice { /// bits sit in the high 10 of its `u16`). #[display("UV Half 10")] UvHalf10, + /// Full‑width Y row of a **12‑bit** planar source ([`Yuv420p12`]). + /// `u16` samples, `width` elements, low‑bit‑packed. + #[display("Y12")] + Y12, + /// Half‑width U row of a **12‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("U Half 12")] + UHalf12, + /// Half‑width V row of a **12‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("V Half 12")] + VHalf12, + /// Half‑width interleaved UV row of a **12‑bit semi‑planar** source + /// ([`P012`]). `u16` samples, `width` elements (high‑bit‑packed: 12 + /// active bits in the high 12 of each `u16`). + #[display("UV Half 12")] + UvHalf12, + /// Full‑width Y row of a **14‑bit** planar source ([`Yuv420p14`]). 
+ /// `u16` samples, `width` elements, low‑bit‑packed. + #[display("Y14")] + Y14, + /// Half‑width U row of a **14‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("U Half 14")] + UHalf14, + /// Half‑width V row of a **14‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("V Half 14")] + VHalf14, } /// A sink that writes any subset of `{RGB, Luma, HSV}` into @@ -1297,6 +1329,545 @@ impl PixelSink for MixedSinker<'_, P010> { } } +// ---- Yuv420p12 impl ---------------------------------------------------- + +impl<'a> MixedSinker<'a, Yuv420p12> { + /// Attaches a packed **`u16`** RGB output buffer. Mirrors + /// [`MixedSinker::with_rgb_u16`] but produces 12‑bit + /// output (values in `[0, 4095]` in the low 12 of each `u16`, upper + /// 4 zero). Length is measured in `u16` **elements** (`width × + /// height × 3`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl Yuv420p12Sink for MixedSinker<'_, Yuv420p12> {} + +impl PixelSink for MixedSinker<'_, Yuv420p12> { + type Input<'r> = Yuv420p12Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: Yuv420p12Row<'_>) -> Result<(), Self::Error> { + // Bit depth is fixed by the format (12) — declared as a const so + // the downshift for u8 luma stays obvious at the call site. + const BITS: u32 = 12; + + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y12, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.u_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf12, + row: idx, + expected: w / 2, + actual: row.u_half().len(), + }); + } + if row.v_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::VHalf12, + row: idx, + expected: w / 2, + actual: row.v_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. 
+ } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> (BITS - 8)) as u8; + } + } + + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + yuv420p12_to_rgb_u16_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + yuv420p12_to_rgb_row( + row.y(), + row.u_half(), + row.v_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + +// ---- Yuv420p14 impl ---------------------------------------------------- + +impl<'a> MixedSinker<'a, Yuv420p14> { + /// Attaches a packed **`u16`** RGB output buffer. 
Produces 14‑bit + /// output (values in `[0, 16383]` in the low 14 of each `u16`, upper + /// 2 zero). Length is measured in `u16` **elements** (`width × + /// height × 3`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl Yuv420p14Sink for MixedSinker<'_, Yuv420p14> {} + +impl PixelSink for MixedSinker<'_, Yuv420p14> { + type Input<'r> = Yuv420p14Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: Yuv420p14Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 14; + + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y14, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.u_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf14, + row: idx, + expected: w / 2, + actual: row.u_half().len(), + }); + } + if row.v_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::VHalf14, + row: idx, + expected: w / 2, + actual: 
row.v_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. + } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> (BITS - 8)) as u8; + } + } + + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + yuv420p14_to_rgb_u16_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + yuv420p14_to_rgb_row( + row.y(), + row.u_half(), + row.v_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + 
); + } + Ok(()) + } +} + +// ---- P012 impl --------------------------------------------------------- + +impl<'a> MixedSinker<'a, P012> { + /// Attaches a packed **`u16`** RGB output buffer. Produces 12‑bit + /// output in **low‑bit‑packed** `yuv420p12le` convention (values in + /// `[0, 4095]` in the low 12 of each `u16`, upper 4 zero) — + /// **not** P012's high‑bit packing. Callers feeding a P012 consumer + /// must shift the output left by 4. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl P012Sink for MixedSinker<'_, P012> {} + +impl PixelSink for MixedSinker<'_, P012> { + type Input<'r> = P012Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: P012Row<'_>) -> Result<(), Self::Error> { + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y12, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.uv_half().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UvHalf12, + row: idx, + 
expected: w, + actual: row.uv_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. + } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + // Luma: P012 samples are high‑bit‑packed (`value << 4`). Taking + // the high byte via `>> 8` gives the top 8 bits of the 12‑bit + // value — identical accessor to P010 (both put active bits in the + // high `BITS` positions of the `u16`). + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> 8) as u8; + } + } + + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + p012_to_rgb_u16_row( + row.y(), + row.uv_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + p012_to_rgb_row( + row.y(), + row.uv_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + 
if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + /// Returns `Ok(())` iff the walker's frame dimensions exactly match /// the sinker's configured dimensions. Called from /// [`PixelSink::begin_frame`] on both `MixedSinker` and diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs index eedc2ab..b3f1f4c 100644 --- a/src/yuv/mod.rs +++ b/src/yuv/mod.rs @@ -10,20 +10,33 @@ //! chroma (Android MediaCodec default). //! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits //! per sample (HDR10 / 10‑bit SDR software decode). +//! - [`Yuv420p12`](crate::yuv::Yuv420p12) — 4:2:0 planar at 12 bits +//! per sample (HEVC Main 12 / VP9 Profile 3 software decode). +//! - [`Yuv420p14`](crate::yuv::Yuv420p14) — 4:2:0 planar at 14 bits +//! per sample (grading / mastering pipelines). //! - [`P010`](crate::yuv::P010) — 4:2:0 semi‑planar at 10 bits per //! sample, high‑bit‑packed (HDR hardware decode: VideoToolbox, //! VA‑API, NVDEC, D3D11VA, Intel QSV). +//! - [`P012`](crate::yuv::P012) — 4:2:0 semi‑planar at 12 bits per +//! sample, high‑bit‑packed (HEVC Main 12 / VP9 Profile 3 hardware +//! decode). //! //! Other families land in follow-up commits. 
mod nv12; mod nv21; mod p010; +mod p012; mod yuv420p; mod yuv420p10; +mod yuv420p12; +mod yuv420p14; pub use nv12::{Nv12, Nv12Row, Nv12Sink, nv12_to}; pub use nv21::{Nv21, Nv21Row, Nv21Sink, nv21_to}; pub use p010::{P010, P010Row, P010Sink, p010_to}; +pub use p012::{P012, P012Row, P012Sink, p012_to}; pub use yuv420p::{Yuv420p, Yuv420pRow, Yuv420pSink, yuv420p_to}; pub use yuv420p10::{Yuv420p10, Yuv420p10Row, Yuv420p10Sink, yuv420p10_to}; +pub use yuv420p12::{Yuv420p12, Yuv420p12Row, Yuv420p12Sink, yuv420p12_to}; +pub use yuv420p14::{Yuv420p14, Yuv420p14Row, Yuv420p14Sink, yuv420p14_to}; diff --git a/src/yuv/p012.rs b/src/yuv/p012.rs new file mode 100644 index 0000000..b7b058e --- /dev/null +++ b/src/yuv/p012.rs @@ -0,0 +1,152 @@ +//! P012 — semi‑planar 4:2:0, 12‑bit, high‑bit‑packed +//! (`AV_PIX_FMT_P012LE`). +//! +//! Storage is a 2‑plane layout identical to [`super::P010`]: one full‑ +//! size Y plane plus one interleaved UV plane at half width and half +//! height. Sample width is `u16` with the 12 active bits in the +//! **high** 12 positions of each element (`sample = value << 4`), low +//! 4 bits zero. This is the 12‑bit sibling of Microsoft's P010 +//! convention and what HEVC Main 12 / VP9 Profile 3 hardware decoders +//! emit. +//! +//! Conversion semantics mirror [`super::P010`] on the layout side and +//! [`super::Yuv420p12`] on the Q‑math side: two consecutive Y rows +//! share one UV row (4:2:0), chroma is nearest‑neighbor upsampled in +//! registers inside the row primitive, and every SIMD backend shifts +//! each `u16` load right by 4 (= `16 - BITS` with `BITS == 12`) to +//! extract the 12‑bit value before running the same Q15 pipeline used +//! by [`super::P010`]. + +use crate::{ColorMatrix, PixelSink, SourceFormat, frame::P012Frame, sealed::Sealed}; + +/// Zero‑sized marker for the P012 source format. Used as the `F` type +/// parameter on [`crate::sinker::MixedSinker`]. 
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct P012; + +impl Sealed for P012 {} +impl SourceFormat for P012 {} + +/// One output row of a P012 source handed to a [`P012Sink`]. +/// +/// Accessors: +/// - [`y`](Self::y) — full‑width Y row (`width` `u16` samples, high‑ +/// bit‑packed). +/// - [`uv_half`](Self::uv_half) — **interleaved, half‑width** UV row +/// (`width` `u16` elements = `width / 2` U/V pairs, U first). The +/// row primitive deinterleaves and upsamples in‑register. +/// - [`row`](Self::row) — output row index (`0 ..= frame.height() - 1`). +/// - [`matrix`](Self::matrix), [`full_range`](Self::full_range) — +/// carried through from the kernel call. +#[derive(Debug, Clone, Copy)] +pub struct P012Row<'a> { + y: &'a [u16], + uv_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> P012Row<'a> { + /// Bundles one row of a P012 source for a [`P012Sink`]. + #[cfg_attr(not(tarpaulin), inline(always))] + pub(crate) fn new( + y: &'a [u16], + uv_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + uv_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples, high‑bit‑packed + /// (12 active bits in the high 12 of each element). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Interleaved UV row — `width` `u16` elements laid out as + /// `U0, V0, U1, V1, …, U_{w/2-1}, V_{w/2-1}`. Each element is + /// high‑bit‑packed. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn uv_half(&self) -> &'a [u16] { + self.uv_half + } + + /// Output row index within the frame. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn row(&self) -> usize { + self.row + } + + /// YUV → RGB matrix carried through from the kernel call. 
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn matrix(&self) -> ColorMatrix {
+        self.matrix
+    }
+
+    /// `true` iff Y uses the full sample range (`[0, 4095]` for 12‑bit,
+    /// scaled into the high 12 bits of each `u16`); `false` for limited
+    /// range.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn full_range(&self) -> bool {
+        self.full_range
+    }
+}
+
+/// Sinks that consume P012 rows.
+///
+/// A subtrait of [`PixelSink`] that pins the row shape to
+/// [`P012Row`]. Implementors get
+/// `process(&mut self, row: P012Row<'_>) -> Result<(), Self::Error>`
+/// via the supertrait.
+pub trait P012Sink: for<'a> PixelSink<Input<'a> = P012Row<'a>> {}
+
+/// Converts a P012 frame by walking its rows and feeding each one to
+/// the [`P012Sink`].
+///
+/// The kernel is a pure row walker — no color arithmetic happens
+/// here. Slice math picks the Y row and the correct UV row for each
+/// output row (`chroma_row = row / 2` for 4:2:0) and hands borrows to
+/// the Sink. The Sink decides what to derive and where to write.
+pub fn p012_to<S: P012Sink>(
+    src: &P012Frame<'_>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    sink.begin_frame(src.width(), src.height())?;
+
+    let w = src.width() as usize;
+    let h = src.height() as usize;
+    let y_stride = src.y_stride() as usize;
+    let uv_stride = src.uv_stride() as usize;
+    // UV row payload is `width` `u16` elements — `width / 2` interleaved
+    // U/V pairs.
+    let uv_row_elems = w;
+
+    let y_plane = src.y();
+    let uv_plane = src.uv();
+
+    for row in 0..h {
+        let y_start = row * y_stride;
+        let y = &y_plane[y_start..y_start + w];
+
+        // 4:2:0 chroma subsampling: two consecutive Y rows share one UV
+        // row.
+ let chroma_row = row / 2; + let uv_start = chroma_row * uv_stride; + let uv_half = &uv_plane[uv_start..uv_start + uv_row_elems]; + + sink.process(P012Row::new(y, uv_half, row, matrix, full_range))?; + } + Ok(()) +} diff --git a/src/yuv/yuv420p12.rs b/src/yuv/yuv420p12.rs new file mode 100644 index 0000000..5995c6c --- /dev/null +++ b/src/yuv/yuv420p12.rs @@ -0,0 +1,161 @@ +//! YUV 4:2:0 planar 12‑bit (`AV_PIX_FMT_YUV420P12LE`). +//! +//! Storage mirrors [`super::Yuv420p10`] — three planes, Y at full size +//! plus U / V at half width and half height — with **`u16`** samples +//! (12 active bits in the **low** 12 of each element, upper 4 zero). +//! The [`Yuv420p12Frame`] type alias pins the bit depth; the underlying +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS`, so the same +//! Q15 scalar + SIMD kernel family that powers `Yuv420p10` runs +//! unchanged against the 12‑bit instantiation. +//! +//! Ships in colconv v0.2a alongside [`super::Yuv420p14`] and +//! [`super::P012`]. Kernel semantics match [`super::Yuv420p10`]: two +//! consecutive Y rows share one chroma row (4:2:0), chroma is +//! nearest‑neighbor upsampled in registers inside the row primitive, +//! and Q15 intermediates stay in i32 (chroma_sum < 10⁹ < i32 max at 12 +//! bits — verified against the scalar reference per SIMD backend). + +use crate::{ + ColorMatrix, PixelSink, SourceFormat, + frame::{Yuv420p12Frame, Yuv420pFrame16}, + sealed::Sealed, +}; + +/// Zero‑sized marker for the YUV 4:2:0 **12‑bit** source format. Used +/// as the `F` type parameter on [`crate::sinker::MixedSinker`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct Yuv420p12; + +impl Sealed for Yuv420p12 {} +impl SourceFormat for Yuv420p12 {} + +/// One output row of a 12‑bit YUV 4:2:0 source handed to a +/// [`Yuv420p12Sink`]. Structurally identical to [`super::Yuv420p10Row`], +/// just with values in `[0, 4095]` instead of `[0, 1023]`. 
+#[derive(Debug, Clone, Copy)]
+pub struct Yuv420p12Row<'a> {
+    y: &'a [u16],
+    u_half: &'a [u16],
+    v_half: &'a [u16],
+    row: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+}
+
+impl<'a> Yuv420p12Row<'a> {
+    /// Bundles one row of a 12‑bit 4:2:0 source for a [`Yuv420p12Sink`].
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        y: &'a [u16],
+        u_half: &'a [u16],
+        v_half: &'a [u16],
+        row: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) -> Self {
+        Self {
+            y,
+            u_half,
+            v_half,
+            row,
+            matrix,
+            full_range,
+        }
+    }
+
+    /// Full‑width Y (luma) row — `width` `u16` samples.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn y(&self) -> &'a [u16] {
+        self.y
+    }
+
+    /// Half‑width U (Cb) row — `width / 2` `u16` samples.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn u_half(&self) -> &'a [u16] {
+        self.u_half
+    }
+
+    /// Half‑width V (Cr) row — `width / 2` `u16` samples.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn v_half(&self) -> &'a [u16] {
+        self.v_half
+    }
+
+    /// Output row index within the frame.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn row(&self) -> usize {
+        self.row
+    }
+
+    /// YUV → RGB matrix carried through from the kernel call.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn matrix(&self) -> ColorMatrix {
+        self.matrix
+    }
+
+    /// `true` iff Y uses the full sample range (`[0, 4095]` for 12‑bit);
+    /// `false` for limited range (`[256, 3760]` luma, `[256, 3840]`
+    /// chroma — the 8‑bit `[16, 235]` / `[16, 240]` ranges scaled by 16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn full_range(&self) -> bool {
+        self.full_range
+    }
+}
+
+/// Sinks that consume 12‑bit YUV 4:2:0 rows.
+pub trait Yuv420p12Sink: for<'a> PixelSink<Input<'a> = Yuv420p12Row<'a>> {}
+
+/// Converts a 12‑bit YUV 4:2:0 frame by walking its rows and feeding
+/// each one to the [`Yuv420p12Sink`].
 Mirrors [`super::yuv420p10_to`] —
+/// pure row walker, all color arithmetic happens inside the Sink via
+/// the crate's row primitives instantiated at `BITS == 12`.
+pub fn yuv420p12_to<S: Yuv420p12Sink>(
+    src: &Yuv420p12Frame<'_>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    yuv420p12_walker::<12, S>(src, full_range, matrix, sink)
+}
+
+/// Row walker for the 12‑bit YUV 4:2:0 source. `BITS` is a const
+/// generic so [`Yuv420pFrame16`] geometry reads (stride, plane
+/// slicing) are monomorphized; the row/sink types bound below are
+/// still pinned to the 12‑bit variants.
+#[cfg_attr(not(tarpaulin), inline(always))]
+fn yuv420p12_walker<const BITS: u32, S: Yuv420p12Sink>(
+    src: &Yuv420pFrame16<'_, BITS>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    sink.begin_frame(src.width(), src.height())?;
+
+    let w = src.width() as usize;
+    let h = src.height() as usize;
+    let y_stride = src.y_stride() as usize;
+    let u_stride = src.u_stride() as usize;
+    let v_stride = src.v_stride() as usize;
+    let chroma_width = w / 2;
+
+    let y_plane = src.y();
+    let u_plane = src.u();
+    let v_plane = src.v();
+
+    for row in 0..h {
+        let y_start = row * y_stride;
+        let y = &y_plane[y_start..y_start + w];
+
+        let chroma_row = row / 2;
+        let u_start = chroma_row * u_stride;
+        let v_start = chroma_row * v_stride;
+        let u_half = &u_plane[u_start..u_start + chroma_width];
+        let v_half = &v_plane[v_start..v_start + chroma_width];
+
+        sink.process(Yuv420p12Row::new(
+            y, u_half, v_half, row, matrix, full_range,
+        ))?;
+    }
+    Ok(())
+}
diff --git a/src/yuv/yuv420p14.rs b/src/yuv/yuv420p14.rs
new file mode 100644
index 0000000..27c54ee
--- /dev/null
+++ b/src/yuv/yuv420p14.rs
@@ -0,0 +1,159 @@
+//! YUV 4:2:0 planar 14‑bit (`AV_PIX_FMT_YUV420P14LE`).
+//!
+//! Storage mirrors [`super::Yuv420p10`] — three planes, Y at full size
+//! plus U / V at half width and half height — with **`u16`** samples
+//! 
(14 active bits in the **low** 14 of each element, upper 2 zero). +//! The [`Yuv420p14Frame`] type alias pins the bit depth; the underlying +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS`, so the same +//! Q15 scalar + SIMD kernel family that powers `Yuv420p10` / +//! `Yuv420p12` runs unchanged against the 14‑bit instantiation. +//! +//! Kernel math constraint: at 14 bits, chroma_sum still fits in i32 +//! (~10⁹ ≤ 2³¹), so the Q15 pipeline stays unchanged. 16‑bit would +//! overflow and needs a separate kernel family. + +use crate::{ + ColorMatrix, PixelSink, SourceFormat, + frame::{Yuv420p14Frame, Yuv420pFrame16}, + sealed::Sealed, +}; + +/// Zero‑sized marker for the YUV 4:2:0 **14‑bit** source format. Used +/// as the `F` type parameter on [`crate::sinker::MixedSinker`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct Yuv420p14; + +impl Sealed for Yuv420p14 {} +impl SourceFormat for Yuv420p14 {} + +/// One output row of a 14‑bit YUV 4:2:0 source handed to a +/// [`Yuv420p14Sink`]. Structurally identical to [`super::Yuv420p10Row`], +/// just with values in `[0, 16383]` instead of `[0, 1023]`. +#[derive(Debug, Clone, Copy)] +pub struct Yuv420p14Row<'a> { + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> Yuv420p14Row<'a> { + /// Bundles one row of a 14‑bit 4:2:0 source for a [`Yuv420p14Sink`]. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub(crate) fn new( + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + u_half, + v_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Half‑width U (Cb) row — `width / 2` `u16` samples. 
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn u_half(&self) -> &'a [u16] {
+        self.u_half
+    }
+
+    /// Half‑width V (Cr) row — `width / 2` `u16` samples.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn v_half(&self) -> &'a [u16] {
+        self.v_half
+    }
+
+    /// Output row index within the frame.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn row(&self) -> usize {
+        self.row
+    }
+
+    /// YUV → RGB matrix carried through from the kernel call.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn matrix(&self) -> ColorMatrix {
+        self.matrix
+    }
+
+    /// `true` iff Y uses the full sample range (`[0, 16383]` for
+    /// 14‑bit); `false` for limited range (`[1024, 15040]` luma,
+    /// `[1024, 15360]` chroma — the 8‑bit `[16, 235]` / `[16, 240]`
+    /// ranges scaled by 64).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn full_range(&self) -> bool {
+        self.full_range
+    }
+}
+
+/// Sinks that consume 14‑bit YUV 4:2:0 rows.
+pub trait Yuv420p14Sink: for<'a> PixelSink<Input<'a> = Yuv420p14Row<'a>> {}
+
+/// Converts a 14‑bit YUV 4:2:0 frame by walking its rows and feeding
+/// each one to the [`Yuv420p14Sink`]. Mirrors [`super::yuv420p10_to`] —
+/// pure row walker, all color arithmetic happens inside the Sink via
+/// the crate's row primitives instantiated at `BITS == 14`.
+pub fn yuv420p14_to<S: Yuv420p14Sink>(
+    src: &Yuv420p14Frame<'_>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    yuv420p14_walker::<14, S>(src, full_range, matrix, sink)
+}
+
+/// Row walker for the 14‑bit YUV 4:2:0 source. `BITS` is a const
+/// generic so [`Yuv420pFrame16`] geometry reads (stride, plane
+/// slicing) are monomorphized; the row/sink types bound below are
+/// still pinned to the 14‑bit variants.
+#[cfg_attr(not(tarpaulin), inline(always))]
+fn yuv420p14_walker<const BITS: u32, S: Yuv420p14Sink>(
+    src: &Yuv420pFrame16<'_, BITS>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    sink.begin_frame(src.width(), src.height())?;
+
+    let w = src.width() as usize;
+    let h = src.height() as usize;
+    let y_stride = src.y_stride() as usize;
+    let u_stride = src.u_stride() as usize;
+    let v_stride = src.v_stride() as usize;
+    let chroma_width = w / 2;
+
+    let y_plane = src.y();
+    let u_plane = src.u();
+    let v_plane = src.v();
+
+    for row in 0..h {
+        let y_start = row * y_stride;
+        let y = &y_plane[y_start..y_start + w];
+
+        let chroma_row = row / 2;
+        let u_start = chroma_row * u_stride;
+        let v_start = chroma_row * v_stride;
+        let u_half = &u_plane[u_start..u_start + chroma_width];
+        let v_half = &v_plane[v_start..v_start + chroma_width];
+
+        sink.process(Yuv420p14Row::new(
+            y, u_half, v_half, row, matrix, full_range,
+        ))?;
+    }
+    Ok(())
+}

From d044da1341bfc314ae17268e2d26422fc04902fe Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 19 Apr 2026 23:55:39 +1200
Subject: [PATCH 4/4] more simd backend

---
 src/frame.rs                 | 172 +++++++++----
 src/lib.rs                   |  47 +++-
 src/row/arch/neon.rs         |  17 +-
 src/row/arch/wasm_simd128.rs |  22 +-
 src/row/arch/x86_avx2.rs     |  22 +-
 src/row/arch/x86_avx512.rs   |  22 +-
 src/row/arch/x86_sse41.rs    |  21 +-
 src/row/scalar.rs            |  27 ++
 src/sinker/mixed.rs          | 474 ++++++++++++++++++++++++++++++++++-
 src/sinker/mod.rs            |   9 +-
 src/yuv/mod.rs               |  27 +-
 src/yuv/yuv420p10.rs         |  23 +-
 12 files changed, 770 insertions(+), 113 deletions(-)

diff --git a/src/frame.rs b/src/frame.rs
index 2c1997b..a56353f 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -496,28 +496,28 @@ pub enum Nv12FrameError {
 ///
 /// # Input sample range and packing sanity
 ///
-/// Each `u16` sample's 10 active bits live in the high 10 positions;
-/// the low 6 bits are expected to be zero. [`Self::try_new`] validates
-/// geometry only.
+/// Each `u16` sample's `BITS` active bits live in the high `BITS` +/// positions; the low `16 - BITS` bits are expected to be zero. +/// [`Self::try_new`] validates geometry only. /// /// [`Self::try_new_checked`] additionally scans every sample and -/// rejects any with non‑zero low 6 bits — a **necessary but not -/// sufficient** packing sanity check. It catches mispacked -/// `yuv420p10le` buffers as long as **at least one** sample has -/// low‑bit content (the usual case for noisy real‑world image data), -/// but it **cannot distinguish** P010 from a `yuv420p10le` buffer -/// whose samples all happen to be multiples of 64. Values like -/// `Y = 64` (limited‑range black) and `UV = 512` (neutral chroma) -/// both have low 6 bits zero and so pass the check, even though the -/// buffer layout is wrong. For strict provenance, callers must rely -/// on their source format metadata and pick the right frame type -/// ([`P010Frame`] vs [`Yuv420p10Frame`]) at construction. +/// rejects any with non‑zero low `16 - BITS` bits — a **necessary +/// but not sufficient** packing sanity check. Its catch rate +/// weakens as `BITS` grows: at `BITS == 10` it rejects 63/64 random +/// samples and is a strong signal; at `BITS == 12` it only rejects +/// 15/16, and **common flat‑region values in decoder output are +/// exactly the ones that slip through** (`Y = 256/1024` limited +/// black, `UV = 2048` neutral chroma are all multiples of 16 in +/// both layouts). See [`Self::try_new_checked`] for the full +/// table. For strict provenance, callers must rely on their source +/// format metadata and pick the right frame type ([`PnFrame`] vs +/// [`Yuv420pFrame16`]) at construction. /// -/// Kernels shift each load right by 6 to extract the 10‑bit value, -/// so mispacked input (e.g. 
a `yuv420p10le` buffer handed to the -/// P010 kernel) produces deterministic, backend‑independent output -/// — wrong colors, but consistently wrong across scalar + every -/// SIMD backend, which is visible in any output diff. +/// Kernels shift each load right by `16 - BITS` to extract the +/// active value, so mispacked input (e.g. a `yuv420p12le` buffer +/// handed to the P012 kernel) produces deterministic, backend‑ +/// independent output — wrong colors, but consistently wrong across +/// scalar + every SIMD backend, which is visible in any output diff. #[derive(Debug, Clone, Copy)] pub struct PnFrame<'a, const BITS: u32> { y: &'a [u16], @@ -633,29 +633,44 @@ impl<'a, const BITS: u32> PnFrame<'a, BITS> { } /// Like [`Self::try_new`] but additionally scans every sample and - /// rejects any whose **low 6 bits** are non‑zero. A valid P010 - /// sample has its 10 active bits in the high 10 positions and zero - /// below, so non‑zero low bits is evidence the buffer isn't P010. + /// rejects any whose **low `16 - BITS` bits** are non‑zero. A valid + /// high‑bit‑packed sample has its `BITS` active bits in the high + /// `BITS` positions and zero below, so non‑zero low bits is + /// evidence the buffer isn't Pn‑shaped. /// /// **This is a packing sanity check, not a provenance validator.** - /// The check catches noisy `yuv420p10le` data (where most samples - /// have low‑bit content), but it **cannot** distinguish P010 from - /// a `yuv420p10le` buffer whose samples all happen to be multiples - /// of 64. Common flat‑region values like `Y = 64` (limited‑range - /// black) or `UV = 512` (neutral chroma) are multiples of 64 in - /// both layouts, so a yuv420p10le buffer of flat content will - /// silently pass this check. 
Callers who need strict provenance - /// must rely on their source format metadata and pick the right - /// frame type at construction ([`P010Frame`] vs [`Yuv420p10Frame`]); - /// no runtime check on opaque `u16` data can reliably tell the two - /// layouts apart. + /// The check catches noisy low‑bit‑packed data (where most samples + /// have low‑bit content), but it **cannot** distinguish Pn from a + /// low‑bit‑packed buffer whose samples all happen to be multiples + /// of `1 << (16 - BITS)`. The catch rate scales with `BITS`: + /// + /// - `BITS == 10` (P010): 6 low bits must be zero. Random u16 + /// samples pass with probability `1/64`; noisy `yuv420p10le` + /// data is almost always caught. + /// - `BITS == 12` (P012): only 4 low bits. Pass probability is + /// `1/16` — 4× weaker. **Common limited‑range flat‑region values + /// (`Y = 256` limited black, `UV = 2048` neutral chroma, + /// `Y = 1024` full black) are all multiples of 16 in both + /// layouts**, so flat `yuv420p12le` content passes **every + /// time**. The `>> 4` extraction in the Pn kernels then + /// discards the real signal and produces badly darkened + /// output. For P012, prefer format metadata over this check. + /// + /// Callers who need strict provenance must rely on their source + /// format metadata and pick the right frame type at construction + /// ([`PnFrame`] vs [`Yuv420pFrame16`]); no runtime check on opaque + /// `u16` data can reliably tell the two layouts apart, and the + /// weakness is proportionally worse the higher the `BITS` value. + /// The regression test + /// `p012_try_new_checked_accepts_low_packed_flat_content_by_design` + /// in `frame::tests` pins this limitation in code. /// /// Cost: one O(plane_size) scan per plane. The default /// [`Self::try_new`] skips this so the hot path stays O(1). /// /// Returns [`PnFrameError::SampleLowBitsSet`] on the first - /// offending sample — carries the plane, element index, and - /// offending value. 
+ /// offending sample — carries the plane, element index, offending + /// value, and the number of low bits expected to be zero. #[cfg_attr(not(tarpaulin), inline(always))] pub fn try_new_checked( y: &'a [u16], @@ -1138,11 +1153,13 @@ pub enum Nv21FrameError { /// [`Self::try_new_checked`] — it scans every sample and returns /// [`Yuv420pFrame16Error::SampleOutOfRange`] on the first violation. /// -/// colconv v0.2 ships `BITS == 10` only (the use‑case keystone for -/// HDR and 10‑bit SDR). 12 and 14 are mechanical follow‑ups that -/// just relax the constructor's `BITS` check and add tiered aliases -/// — the kernel math (Q15 coefficients + i32 intermediates) works -/// unchanged across all three, derived at compile time from `BITS`. +/// All three supported depths — `BITS == 10` (HDR10 / 10‑bit SDR +/// keystone), `BITS == 12` (HEVC Main 12 / VP9 Profile 3), and +/// `BITS == 14` (grading / mastering pipelines) — share the same +/// scalar + SIMD kernel family. The Q15 coefficients + i32 +/// intermediates work unchanged across all three, derived at +/// compile time from `BITS`; the constructor validates the `BITS` +/// value against the set `{10, 12, 14}` up front. /// /// 16‑bit input (which would overflow the i32 chroma sum in the /// Q15 path) is **not** represented by this type — it needs a @@ -1174,8 +1191,9 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { /// lengths, and the `BITS` parameter. /// /// Returns [`Yuv420pFrame16Error`] if any of: - /// - `BITS` is not 10, 12, or 14 (colconv v0.2 additionally rejects - /// 12/14 at the type alias layer — see [`Yuv420p10Frame`]), + /// - `BITS` is not 10, 12, or 14 — use [`Yuv420p10Frame`], + /// [`Yuv420p12Frame`], or [`Yuv420p14Frame`] at call sites for + /// readability, all three are type aliases over this struct, /// - `width` or `height` is zero, /// - `width` is odd, /// - any stride is smaller than the plane's declared pixel width, @@ -2358,4 +2376,74 @@ mod tests { // source values). 
That's accepted behavior — the type system, // not `try_new_checked`, is what keeps yuv420p10le out of P010. } + + #[test] + fn p012_try_new_checked_accepts_shifted_samples() { + // Valid P012 samples: low 4 bits zero (12-bit value << 4). + let y = std::vec![(2048u16) << 4; 16 * 8]; // 12-bit mid-gray shifted up + let uv = std::vec![(2048u16) << 4; 16 * 4]; + P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).expect("shifted samples valid"); + } + + #[test] + fn p012_try_new_checked_rejects_low_bits_set() { + // A Y sample with any of the low 4 bits set — e.g. yuv420p12le + // value 0x0ABC landing where P012 expects `value << 4`. The check + // catches samples like this that are obviously mispacked. + let mut y = std::vec![(2048u16) << 4; 16 * 8]; + y[3 * 16 + 5] = 0x0ABC; // low 4 bits = 0xC ≠ 0 + let uv = std::vec![(2048u16) << 4; 16 * 4]; + let e = P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + match e { + PnFrameError::SampleLowBitsSet { + plane, + value, + low_bits, + .. + } => { + assert_eq!(plane, PnFramePlane::Y); + assert_eq!(value, 0x0ABC); + assert_eq!(low_bits, 4); + } + other => panic!("expected SampleLowBitsSet, got {other:?}"), + } + } + + /// Regression documenting a **worse known limitation** of + /// [`P012Frame::try_new_checked`] compared to P010: because the + /// low‑bits check only has 4 bits to work with at `BITS == 12`, + /// every multiple‑of‑16 `yuv420p12le` value passes silently. The + /// practical impact is that common limited‑range flat‑region + /// content in real decoder output — `Y = 256` (limited‑range + /// black), `UV = 2048` (neutral chroma), `Y = 1024` (full black) + /// — is entirely invisible to this check. + /// + /// This test pins the limitation with a reproducible input so + /// that: + /// 1. Users reading the test suite can see the exact failure + /// mode for `try_new_checked` on 12‑bit data. + /// 2. 
Any future attempt to strengthen `try_new_checked` (e.g., + /// into a statistical provenance heuristic) has a concrete + /// input to validate against. + /// 3. The `PnFrame` docs' warning about this limitation has a + /// named test to point to. + /// + /// For P012, the type system (choosing [`P012Frame`] vs + /// [`Yuv420p12Frame`] at construction based on decoder metadata) + /// is the only reliable provenance guarantee. + #[test] + fn p012_try_new_checked_accepts_low_packed_flat_content_by_design() { + // All values are multiples of 16 — exactly the set that slips + // through a 4-low-bits-zero check. `yuv420p12le` limited-range + // black and neutral chroma both satisfy this. + let y = std::vec![0x0100u16; 16 * 8]; // Y = 256 (limited-range black), multiple of 16 + let uv = std::vec![0x0800u16; 16 * 4]; // UV = 2048 (neutral chroma), multiple of 16 + let f = P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16) + .expect("known limitation: 4-low-bits-zero check cannot tell yuv420p12le from P012"); + assert_eq!(f.width(), 16); + // Downstream P012 kernels would extract `>> 4` — giving Y=16 and + // UV=128 instead of the intended Y=256 and UV=2048. Silent color + // corruption. The type system, not `try_new_checked`, must + // guarantee provenance for 12-bit. + } } diff --git a/src/lib.rs b/src/lib.rs index ec97890..40b295d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,8 +24,46 @@ //! [`with_hsv`](sinker::MixedSinker::with_hsv) to select which channels //! to derive. //! -//! The crate design also follows a per-format expansion plan with -//! defined implementation priority tiers for the conversion kernels. +//! # Supported source formats +//! +//! Shipped (all 4:2:0 subsampling): +//! +//! | Family | Bit depth | Packing | FFmpeg name | +//! | ---------------- | --------- | ---------------------- | --------------------- | +//! | [`Yuv420p`] | 8 | planar | `yuv420p` | +//! | [`Nv12`] | 8 | semi-planar UV | `nv12` | +//! 
| [`Nv21`] | 8 | semi-planar VU | `nv21` | +//! | [`Yuv420p10`] | 10 | planar, low-packed | `yuv420p10le` | +//! | [`Yuv420p12`] | 12 | planar, low-packed | `yuv420p12le` | +//! | [`Yuv420p14`] | 14 | planar, low-packed | `yuv420p14le` | +//! | [`P010`] | 10 | semi-planar, high-packed | `p010le` | +//! | [`P012`] | 12 | semi-planar, high-packed | `p012le` | +//! +//! Not yet shipped (follow-up): +//! +//! - **16‑bit families** (`Yuv420p16` / `P016`) — require a separate +//! kernel family because the Q15 chroma_sum overflows i32 at +//! `BITS == 16`. Current scalar / SIMD kernels `debug_assert!` out +//! `BITS == 16` precisely to surface this. +//! - **4:2:2 and 4:4:4** (`Yuv422p`, `Yuv444p`, `Nv16`, `Nv24`, +//! `Nv42`) — share the Q15 math but need their own row walkers +//! for the different chroma subsampling / stride. +//! - **Packed RGB sources** (`Rgb24`, `Bgr24`, `Rgba`, `Bgra`, +//! `Rgba1010102`, etc.). +//! +//! See [`yuv`] for the per-format module-level breakdown and +//! [`frame`] for the validated frame types plus the `BITS` const +//! generic on the high-bit-depth families (`Yuv420pFrame16` +//! and `PnFrame`). +//! +//! [`Yuv420p`]: crate::yuv::Yuv420p +//! [`Nv12`]: crate::yuv::Nv12 +//! [`Nv21`]: crate::yuv::Nv21 +//! [`Yuv420p10`]: crate::yuv::Yuv420p10 +//! [`Yuv420p12`]: crate::yuv::Yuv420p12 +//! [`Yuv420p14`]: crate::yuv::Yuv420p14 +//! [`P010`]: crate::yuv::P010 +//! [`P012`]: crate::yuv::P012 #![cfg_attr(not(feature = "std"), no_std)] #![cfg_attr(docsrs, feature(doc_cfg))] @@ -167,8 +205,9 @@ pub trait PixelSink { } /// Consume one input unit. Called by the kernel once per unit (one - /// row, for the row-granular kernels v0.1 ships). Input borrows may - /// be invalidated after the call returns — implementations must not + /// row, for the row-granular kernels currently shipped). Input + /// borrows may be invalidated after the call returns — + /// implementations must not /// retain them. 
/// /// Returns `Err` to short-circuit the walker: on the first `Err`, diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index f8ceba2..878d5e3 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -643,17 +643,20 @@ pub(crate) unsafe fn p_n_to_rgb_row( /// **native‑depth `u16`** RGB (low‑bit‑packed output, /// `yuv420p10le` / `yuv420p12le` convention — not P010/P012). /// -/// Same structure as [`p010_to_rgb_row`] up to the chroma compute; -/// the only differences are: -/// - `range_params_n::<10, 10>` → larger scales targeting the 10‑bit -/// output range. -/// - Clamp is explicit min/max to `[0, 1023]` via -/// [`clamp_u10`](crate::row::arch::neon::clamp_u10). +/// Same structure as [`super::neon::p_n_to_rgb_row`] up to the +/// chroma compute; the only differences are: +/// - `range_params_n::` → larger scales targeting the +/// native‑depth output range. +/// - Clamp is explicit min/max to `[0, (1 << BITS) - 1]` via +/// [`clamp_u10`](crate::row::arch::neon::clamp_u10) — the helper +/// name is historical; the actual max is derived from `BITS` at +/// the call site (1023 for P010, 4095 for P012). /// - Writes use two `vst3q_u16` calls per 16‑pixel block. /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index f0619a9..2a54fbd 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -500,18 +500,22 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { } } -/// WASM simd128 P010 → packed **8‑bit** RGB. +/// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → +/// packed **8‑bit** RGB. /// /// Block size 16 Y pixels / 8 chroma pairs per iteration. 
Mirrors -/// [`yuv420p10_to_rgb_row`] with two structural differences: -/// - Samples are shifted right by 6 (`u16x8_shr(_, 6)`) instead of -/// AND‑masked. +/// [`super::wasm_simd128::yuv_420p_n_to_rgb_row`] with two structural +/// differences: +/// - Samples are shifted right by `16 - BITS` (`u16x8_shr`, with +/// the shift amount computed from `BITS` once per call) instead +/// of AND‑masked. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_wasm`] /// (two `u8x16_swizzle` + two `i8x16_shuffle` combines). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -621,12 +625,14 @@ pub(crate) unsafe fn p_n_to_rgb_row( } } -/// WASM simd128 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed -/// `yuv420p10le` convention). +/// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → +/// packed **native‑depth `u16`** RGB (low‑bit‑packed output, +/// `yuv420pNle` convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 16deb67..10258c2 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -527,19 +527,23 @@ fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { unsafe { _mm256_min_epi16(_mm256_max_epi16(v, zero_v), max_v) } } -/// AVX2 P010 → packed **8‑bit** RGB. +/// AVX2 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 32 Y pixels / 16 chroma pairs per iteration. Mirrors -/// [`yuv420p10_to_rgb_row`] with two structural differences: -/// - Samples are shifted right by 6 (`_mm256_srli_epi16::<6>`) -/// instead of AND‑masked. 
+/// [`super::x86_avx2::yuv_420p_n_to_rgb_row`] with two structural +/// differences: +/// - Samples are shifted right by `16 - BITS` (`_mm256_srl_epi16`, +/// with a shift count computed from `BITS` once per call) instead +/// of AND‑masked. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx2`] /// (two `_mm256_shuffle_epi8` + two `_mm256_permute4x64_epi64` + /// two `_mm256_permute2x128_si256` per 32 chroma elements). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -660,12 +664,14 @@ pub(crate) unsafe fn p_n_to_rgb_row( } } -/// AVX2 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed -/// `yuv420p10le` convention). +/// AVX2 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 238a09a..3925276 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -572,12 +572,15 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u } } -/// AVX‑512 P010 → packed **8‑bit** RGB. +/// AVX‑512 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 64 Y pixels / 32 chroma pairs per iteration. Mirrors -/// [`yuv420p10_to_rgb_row`] with two structural differences: -/// - Samples are shifted right by 6 (`_mm512_srli_epi16::<6>`) -/// instead of AND‑masked. 
+/// [`super::x86_avx512::yuv_420p_n_to_rgb_row`] with two structural +/// differences: +/// - Samples are shifted right by `16 - BITS` (`_mm512_srl_epi16`, +/// with a shift count computed from `BITS` once per call) instead +/// of AND‑masked. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx512`] /// — per‑128‑lane shuffle + 64‑bit permute + cross‑vector /// `_mm512_permutex2var_epi64` to produce 32‑sample U and V @@ -585,7 +588,8 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -707,12 +711,14 @@ pub(crate) unsafe fn p_n_to_rgb_row( } } -/// AVX‑512 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed -/// `yuv420p10le` convention). +/// AVX‑512 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 1dd5f2d..75796bf 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -193,12 +193,14 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } -/// SSE4.1 P010 → packed **8‑bit** RGB. +/// SSE4.1 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 16 Y pixels / 8 chroma pairs per iteration. 
Differences -/// from [`yuv420p10_to_rgb_row`]: -/// - Samples are shifted right by 6 (`_mm_srli_epi16::<6>`) instead -/// of AND‑masked — P010's 10 active bits live in the HIGH 10 of +/// from [`super::x86_sse41::yuv_420p_n_to_rgb_row`]: +/// - Samples are shifted right by `16 - BITS` (`_mm_srl_epi16`, with +/// a shift count computed from `BITS` once per call) instead of +/// AND‑masked — Pn's `BITS` active bits live in the HIGH `BITS` of /// each `u16`. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16`] /// below (one `_mm_shuffle_epi8` + two 64‑bit unpacks per 16 @@ -206,7 +208,8 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -320,12 +323,14 @@ pub(crate) unsafe fn p_n_to_rgb_row( } } -/// SSE4.1 P010 → packed **10‑bit `u16`** RGB (native‑depth, -/// low‑bit‑packed — `yuv420p10le` convention). +/// SSE4.1 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 26759c9..8d45b48 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -207,6 +207,13 @@ pub(crate) fn yuv_420p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // Low-bit-packed planar kernels are defined for BITS in {10, 12, 14}. + // 16 would overflow the Q15 chroma sum; 8 belongs to the non- + // const-generic `yuv_420_to_rgb_row` family. 
+ debug_assert!( + BITS == 10 || BITS == 12 || BITS == 14, + "yuv_420p_n_to_rgb_row only supports BITS in {{10, 12, 14}}" + ); debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); @@ -300,6 +307,12 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // Same BITS range as the u8-output counterpart. See + // `yuv_420p_n_to_rgb_row` for the rationale. + debug_assert!( + BITS == 10 || BITS == 12 || BITS == 14, + "yuv_420p_n_to_rgb_u16_row only supports BITS in {{10, 12, 14}}" + ); debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); @@ -373,6 +386,14 @@ pub(crate) fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // High-bit-packed Pn kernels are only defined for BITS in {10, 12}. + // Outside that set, `16 - BITS` could under/overflow and the Q15 + // coefficient table has no corresponding entry. Caught here before + // the SIMD dispatcher hands control to unsafe code. + debug_assert!( + BITS == 10 || BITS == 12, + "p_n_to_rgb_row only supports BITS in {{10, 12}}" + ); debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); @@ -443,6 +464,12 @@ pub(crate) fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // See `p_n_to_rgb_row` for the BITS range rationale. Duplicated + // here so either entry point catches misuse on its own. 
+ debug_assert!( + BITS == 10 || BITS == 12, + "p_n_to_rgb_u16_row only supports BITS in {{10, 12}}" + ); debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 5877255..4d210a7 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -2,9 +2,17 @@ //! written into my own buffers" consumer. //! //! Generic over the source format via an `F: SourceFormat` type -//! parameter. One `PixelSink` impl per supported format; v0.1 ships -//! the [`Yuv420p`](crate::yuv::Yuv420p), -//! [`Nv12`](crate::yuv::Nv12), and [`Nv21`](crate::yuv::Nv21) impls. +//! parameter. One `PixelSink` impl per supported format. Currently +//! ships impls for: +//! +//! - 8‑bit 4:2:0: [`Yuv420p`](crate::yuv::Yuv420p), +//! [`Nv12`](crate::yuv::Nv12), [`Nv21`](crate::yuv::Nv21). +//! - 10/12/14‑bit planar 4:2:0: [`Yuv420p10`](crate::yuv::Yuv420p10), +//! [`Yuv420p12`](crate::yuv::Yuv420p12), +//! [`Yuv420p14`](crate::yuv::Yuv420p14). +//! - 10/12‑bit semi‑planar high‑bit‑packed 4:2:0: +//! [`P010`](crate::yuv::P010), [`P012`](crate::yuv::P012). +//! //! All configuration and processing methods are fallible — no panics //! under normal contract violations — so the sink is usable on //! `panic = "abort"` targets. @@ -228,8 +236,12 @@ pub enum RowSlice { /// bits sit in the high 10 of its `u16`). #[display("UV Half 10")] UvHalf10, - /// Full‑width Y row of a **12‑bit** planar source ([`Yuv420p12`]). - /// `u16` samples, `width` elements, low‑bit‑packed. + /// Full‑width Y row of a **12‑bit** source — used for both the + /// planar ([`Yuv420p12`], low‑bit‑packed) and semi‑planar + /// ([`P012`], high‑bit‑packed) families. `u16` samples, `width` + /// elements. The packing direction depends on the source format; + /// the row‑shape check only verifies length, so a single variant + /// covers both. 
#[display("Y12")] Y12, /// Half‑width U row of a **12‑bit** planar source. `u16` samples, @@ -276,10 +288,9 @@ pub enum RowSlice { /// # Type parameter /// /// `F` identifies the source format — `Yuv420p`, `Nv12`, `Nv21`, -/// `Bgr24`, etc. Each format provides its own -/// `impl PixelSink for MixedSinker<'_, F>`. v0.1 ships impls for -/// [`Yuv420p`](crate::yuv::Yuv420p), [`Nv12`](crate::yuv::Nv12), and -/// [`Nv21`](crate::yuv::Nv21). +/// `Yuv420p10`, `Yuv420p12`, `Yuv420p14`, `P010`, `P012`, etc. Each +/// format provides its own `impl PixelSink for MixedSinker<'_, F>`. +/// See the module‑level docs for the full list of shipped impls. pub struct MixedSinker<'a, F: SourceFormat> { rgb: Option<&'a mut [u8]>, rgb_u16: Option<&'a mut [u16]>, @@ -1905,8 +1916,13 @@ mod tests { use super::*; use crate::{ ColorMatrix, - frame::{Nv12Frame, Nv21Frame, P010Frame, Yuv420p10Frame, Yuv420pFrame}, - yuv::{nv12_to, nv21_to, p010_to, yuv420p_to, yuv420p10_to}, + frame::{ + Nv12Frame, Nv21Frame, P010Frame, P012Frame, Yuv420p10Frame, Yuv420p12Frame, Yuv420p14Frame, + Yuv420pFrame, + }, + yuv::{ + nv12_to, nv21_to, p010_to, p012_to, yuv420p_to, yuv420p10_to, yuv420p12_to, yuv420p14_to, + }, }; fn solid_yuv420p_frame( @@ -3102,4 +3118,440 @@ mod tests { assert_eq!(rgb_scalar, rgb_simd); assert_eq!(rgb_u16_scalar, rgb_u16_simd); } + + // ---- Yuv420p12 --------------------------------------------------------- + // + // Planar 12-bit, low-bit-packed. Mirrors the Yuv420p10 shape — same + // planar layout, wider sample range. `mid-gray` for 12-bit is + // Y=UV=2048; native-depth white (full-range) is 4095. 
+    fn solid_yuv420p12_frame(
+        width: u32,
+        height: u32,
+        y: u16,
+        u: u16,
+        v: u16,
+    ) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
+        let w = width as usize;
+        let h = height as usize;
+        let cw = w / 2;
+        let ch = h / 2;
+        (
+            std::vec![y; w * h],
+            std::vec![u; cw * ch],
+            std::vec![v; cw * ch],
+        )
+    }
+
+    #[test]
+    fn yuv420p12_rgb_u8_only_gray_is_gray() {
+        let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048);
+        let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p12>::new(16, 8)
+            .with_rgb(&mut rgb)
+            .unwrap();
+        yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+ let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 4095, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(rgb_u8.iter().all(|&c| c == 255)); + assert!(rgb_u16.iter().all(|&c| c == 4095)); + } + + #[test] + fn yuv420p12_luma_downshifts_to_8bit() { + // Y=2048 at 12 bits → 2048 >> (12 - 8) = 128 at 8 bits. + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&l| l == 128)); + } + + #[test] + fn yuv420p12_hsv_from_gray_is_zero_hue_zero_sat() { + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut h = std::vec![0xFFu8; 16 * 8]; + let mut s = std::vec![0xFFu8; 16 * 8]; + let mut v = std::vec![0xFFu8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_hsv(&mut h, &mut s, &mut v) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(h.iter().all(|&b| b == 0)); + assert!(s.iter().all(|&b| b == 0)); + assert!(v.iter().all(|&b| b.abs_diff(128) <= 1)); + } + + #[test] + fn yuv420p12_rgb_u16_too_short_returns_err() { + let mut rgb = std::vec![0u16; 10]; + let err = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .err() + .unwrap(); + assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { .. 
})); + } + + #[test] + fn yuv420p12_with_simd_false_matches_with_simd_true() { + let (yp, up, vp) = solid_yuv420p12_frame(64, 16, 2400, 1600, 2800); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; + let mut s_scalar = MixedSinker::::new(64, 16) + .with_simd(false) + .with_rgb(&mut rgb_scalar) + .unwrap() + .with_rgb_u16(&mut rgb_u16_scalar) + .unwrap(); + yuv420p12_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + + let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; + let mut s_simd = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_simd) + .unwrap() + .with_rgb_u16(&mut rgb_u16_simd) + .unwrap(); + yuv420p12_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); + + assert_eq!(rgb_scalar, rgb_simd); + assert_eq!(rgb_u16_scalar, rgb_u16_simd); + } + + // ---- Yuv420p14 --------------------------------------------------------- + + fn solid_yuv420p14_frame( + width: u32, + height: u32, + y: u16, + u: u16, + v: u16, + ) -> (Vec, Vec, Vec) { + let w = width as usize; + let h = height as usize; + let cw = w / 2; + let ch = h / 2; + ( + std::vec![y; w * h], + std::vec![u; cw * ch], + std::vec![v; cw * ch], + ) + } + + #[test] + fn yuv420p14_rgb_u8_only_gray_is_gray() { + // 14-bit mid-gray: Y=UV=8192. 
+        let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p14>::new(16, 8)
+            .with_rgb(&mut rgb)
+            .unwrap();
+        yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+
+    #[test]
+    fn yuv420p14_rgb_u16_only_native_depth_gray() {
+        let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p14>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .unwrap();
+        yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(8192) <= 1, "got {px:?}");
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+            assert!(px[0] <= 16383);
+        }
+    }
+
+    #[test]
+    fn yuv420p14_luma_downshifts_to_8bit() {
+        // Y=8192 at 14 bits → 8192 >> (14 - 8) = 128.
+        let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut luma = std::vec![0u8; 16 * 8];
+        let mut sink = MixedSinker::<Yuv420p14>::new(16, 8)
+            .with_luma(&mut luma)
+            .unwrap();
+        yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(luma.iter().all(|&l| l == 128));
+    }
+
+    #[test]
+    fn yuv420p14_rgb_u8_and_u16_both_populated() {
+        let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 16383, 8192, 8192);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
+        let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p14>::new(16, 8)
+            .with_rgb(&mut rgb_u8)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16)
+            .unwrap();
+        yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(rgb_u8.iter().all(|&c| c == 255));
+        assert!(rgb_u16.iter().all(|&c| c == 16383));
+    }
+
+    #[test]
+    fn yuv420p14_with_simd_false_matches_with_simd_true() {
+        let (yp, up, vp) = solid_yuv420p14_frame(64, 16, 9600, 6400, 11200);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32);
+
+        let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3];
+        let mut s_scalar = MixedSinker::<Yuv420p14>::new(64, 16)
+            .with_simd(false)
+            .with_rgb(&mut rgb_scalar)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_scalar)
+            .unwrap();
+        yuv420p14_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap();
+
+        let mut rgb_simd = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3];
+        let mut s_simd = MixedSinker::<Yuv420p14>::new(64, 16)
+            .with_rgb(&mut rgb_simd)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_simd)
+            .unwrap();
+        yuv420p14_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap();
+
+        assert_eq!(rgb_scalar, rgb_simd);
+        assert_eq!(rgb_u16_scalar, rgb_u16_simd);
+    }
+
+    // ---- P012 --------------------------------------------------------------
+    //
+    //
Semi-planar 12-bit, high-bit-packed (samples in high 12 of each
+    // u16). Mirrors the P010 test shape — UV interleaved, `value << 4`.
+
+    fn solid_p012_frame(
+        width: u32,
+        height: u32,
+        y_12bit: u16,
+        u_12bit: u16,
+        v_12bit: u16,
+    ) -> (Vec<u16>, Vec<u16>) {
+        let w = width as usize;
+        let h = height as usize;
+        let cw = w / 2;
+        let ch = h / 2;
+        // Shift into the high 12 bits (P012 packing).
+        let y = std::vec![y_12bit << 4; w * h];
+        let uv: Vec<u16> = (0..cw * ch)
+            .flat_map(|_| [u_12bit << 4, v_12bit << 4])
+            .collect();
+        (y, uv)
+    }
+
+    #[test]
+    fn p012_rgb_u8_only_gray_is_gray() {
+        let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048);
+        let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P012>::new(16, 8).with_rgb(&mut rgb).unwrap();
+        p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+
+    #[test]
+    fn p012_rgb_u16_only_native_depth_gray() {
+        // Output is low-bit-packed 12-bit (yuv420p12le convention).
+        let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048);
+        let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P012>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .unwrap();
+        p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(2048) <= 1, "got {px:?}");
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+            assert!(
+                px[0] <= 4095,
+                "output must stay within 12-bit low-packed range"
+            );
+        }
+    }
+
+    #[test]
+    fn p012_rgb_u8_and_u16_both_populated() {
+        let (yp, uvp) = solid_p012_frame(16, 8, 4095, 2048, 2048);
+        let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
+        let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P012>::new(16, 8)
+            .with_rgb(&mut rgb_u8)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16)
+            .unwrap();
+        p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(rgb_u8.iter().all(|&c| c == 255));
+        assert!(rgb_u16.iter().all(|&c| c == 4095));
+    }
+
+    #[test]
+    fn p012_luma_downshifts_to_8bit() {
+        // Y=2048 at 12 bits, P012-packed (2048 << 4 = 0x8000). After >> 8,
+        // the 8-bit luma is 0x80 = 128 — same accessor as P010 since both
+        // store active bits in the high positions.
+        let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048);
+        let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut luma = std::vec![0u8; 16 * 8];
+        let mut sink = MixedSinker::<P012>::new(16, 8)
+            .with_luma(&mut luma)
+            .unwrap();
+        p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(luma.iter().all(|&l| l == 128));
+    }
+
+    #[test]
+    fn p012_matches_yuv420p12_mixed_sinker_with_shifted_samples() {
+        // Logical equivalence — same 12-bit samples fed through both
+        // layouts must produce byte-identical u8 RGB.
+        let w = 16u32;
+        let h = 8u32;
+        let y = 2400u16;
+        let u = 1600u16;
+        let v = 2800u16;
+
+        let (yp_p12, up_p12, vp_p12) = solid_yuv420p12_frame(w, h, y, u, v);
+        let src_p12 = Yuv420p12Frame::new(&yp_p12, &up_p12, &vp_p12, w, h, w, w / 2, w / 2);
+
+        let (yp_p012, uvp_p012) = solid_p012_frame(w, h, y, u, v);
+        let src_p012 = P012Frame::new(&yp_p012, &uvp_p012, w, h, w, w);
+
+        let mut rgb_yuv = std::vec![0u8; (w * h * 3) as usize];
+        let mut rgb_p012 = std::vec![0u8; (w * h * 3) as usize];
+        let mut s_yuv = MixedSinker::<Yuv420p12>::new(w as usize, h as usize)
+            .with_rgb(&mut rgb_yuv)
+            .unwrap();
+        let mut s_p012 = MixedSinker::<P012>::new(w as usize, h as usize)
+            .with_rgb(&mut rgb_p012)
+            .unwrap();
+        yuv420p12_to(&src_p12, true, ColorMatrix::Bt709, &mut s_yuv).unwrap();
+        p012_to(&src_p012, true, ColorMatrix::Bt709, &mut s_p012).unwrap();
+        assert_eq!(rgb_yuv, rgb_p012);
+    }
+
+    #[test]
+    fn p012_rgb_u16_too_short_returns_err() {
+        let mut rgb = std::vec![0u16; 10];
+        let err = MixedSinker::<P012>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .err()
+            .unwrap();
+        assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { ..
}));
+    }
+
+    #[test]
+    fn p012_with_simd_false_matches_with_simd_true() {
+        let (yp, uvp) = solid_p012_frame(64, 16, 2400, 1600, 2800);
+        let src = P012Frame::new(&yp, &uvp, 64, 16, 64, 64);
+
+        let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3];
+        let mut s_scalar = MixedSinker::<P012>::new(64, 16)
+            .with_simd(false)
+            .with_rgb(&mut rgb_scalar)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_scalar)
+            .unwrap();
+        p012_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap();
+
+        let mut rgb_simd = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3];
+        let mut s_simd = MixedSinker::<P012>::new(64, 16)
+            .with_rgb(&mut rgb_simd)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_simd)
+            .unwrap();
+        p012_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap();
+
+        assert_eq!(rgb_scalar, rgb_simd);
+        assert_eq!(rgb_u16_scalar, rgb_u16_simd);
+    }
 }
diff --git a/src/sinker/mod.rs b/src/sinker/mod.rs
index e6d6d0a..90ce325 100644
--- a/src/sinker/mod.rs
+++ b/src/sinker/mod.rs
@@ -1,10 +1,11 @@
 //! [`PixelSink`](crate::PixelSink) implementations shipped with the
 //! crate.
 //!
-//! v0.1 ships [`MixedSinker`](mixed::MixedSinker), which writes any
-//! subset of `{RGB, Luma, HSV}` into caller-provided buffers. Narrow
-//! newtype shortcuts (luma-only, RGB-only, HSV-only) will be added in
-//! follow-up commits once the MixedSinker path is proven.
+//! Currently ships [`MixedSinker`](mixed::MixedSinker), which writes
+//! any subset of `{RGB, Luma, HSV}` into caller-provided buffers.
+//! It has per-format `PixelSink` impls for all eight shipped YUV
+//! source formats (see [`crate::yuv`] for the list). Narrow newtype
+//! shortcuts (luma-only, RGB-only, HSV-only) are a follow-up.
 //!
 //! `MixedSinker` keeps a lazily‑grown `Vec` scratch buffer for
 //!
the HSV‑without‑RGB path, so it is only compiled under the `std` diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs index b3f1f4c..fcbe395 100644 --- a/src/yuv/mod.rs +++ b/src/yuv/mod.rs @@ -1,6 +1,9 @@ //! YUV source kernels. //! -//! One sub-module and kernel per YUV pixel-format family: +//! One sub-module and kernel per YUV pixel-format family. +//! +//! # Shipped (8-bit 4:2:0) +//! //! - [`Yuv420p`](crate::yuv::Yuv420p) — the mainline 4:2:0 **planar** //! layout (H.264 / HEVC / AV1 / VP9 software‑decode default). //! - [`Nv12`](crate::yuv::Nv12) — 4:2:0 **semi‑planar** with interleaved @@ -8,12 +11,18 @@ //! default). //! - [`Nv21`](crate::yuv::Nv21) — 4:2:0 semi‑planar with **VU**-ordered //! chroma (Android MediaCodec default). +//! +//! # Shipped (high-bit-depth 4:2:0, low-bit-packed planar) +//! //! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits //! per sample (HDR10 / 10‑bit SDR software decode). //! - [`Yuv420p12`](crate::yuv::Yuv420p12) — 4:2:0 planar at 12 bits //! per sample (HEVC Main 12 / VP9 Profile 3 software decode). //! - [`Yuv420p14`](crate::yuv::Yuv420p14) — 4:2:0 planar at 14 bits //! per sample (grading / mastering pipelines). +//! +//! # Shipped (high-bit-depth 4:2:0, high-bit-packed semi-planar) +//! //! - [`P010`](crate::yuv::P010) — 4:2:0 semi‑planar at 10 bits per //! sample, high‑bit‑packed (HDR hardware decode: VideoToolbox, //! VA‑API, NVDEC, D3D11VA, Intel QSV). @@ -21,7 +30,21 @@ //! sample, high‑bit‑packed (HEVC Main 12 / VP9 Profile 3 hardware //! decode). //! -//! Other families land in follow-up commits. +//! # Not yet shipped +//! +//! - **16‑bit** (`Yuv420p16` / `P016`) — blocked on a separate +//! kernel family. At `BITS == 16` the Q15 chroma_sum overflows +//! i32, so this needs either i64 intermediates or a lower‑Q +//! coefficient format. The scalar and SIMD kernels here +//! deliberately gate `BITS` to `{10, 12, 14}` (planar) and +//! `{10, 12}` (semi‑planar) via `debug_assert!`. +//! 
- **4:2:2 / 4:4:4** (`Yuv422p`, `Yuv444p`, `Nv16`, `Nv24`, +//! `Nv42`) — follow‑up, not yet started. They share the scalar +//! Q15 math but need their own row walkers (different chroma +//! subsampling / stride). +//! - **Packed RGB sources** (`Rgb24`, `Bgr24`, `Rgba`, `Bgra`, +//! `Rgba1010102`, etc.) — follow‑up. Will land as their own +//! family of `*_to` kernels feeding a new row‑shape subtrait. mod nv12; mod nv21; diff --git a/src/yuv/yuv420p10.rs b/src/yuv/yuv420p10.rs index 1a85e06..812f180 100644 --- a/src/yuv/yuv420p10.rs +++ b/src/yuv/yuv420p10.rs @@ -4,14 +4,14 @@ //! plus U / V at half width and half height — but sample width is //! **`u16`** (10 active bits in the low bits of each element). The //! [`Yuv420p10Frame`] type alias pins the bit depth; the underlying -//! [`Yuv420pFrame16`] struct is const‑generic over `BITS` so 12‑bit -//! and 14‑bit variants can be added by relaxing its validator without -//! changing kernel math. +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS` and the +//! 12‑bit / 14‑bit siblings ([`super::Yuv420p12`] / [`super::Yuv420p14`]) +//! reuse the same scalar + SIMD kernel family with a different +//! monomorphization. //! -//! Ships in colconv v0.2 as the first high‑bit‑depth format (HDR / -//! 10‑bit SDR keystone). Kernel semantics match [`super::Yuv420p`]: -//! two consecutive Y rows share one chroma row (4:2:0), chroma is -//! nearest‑neighbor upsampled in registers inside the row primitive. +//! Kernel semantics match [`super::Yuv420p`]: two consecutive Y rows +//! share one chroma row (4:2:0), chroma is nearest‑neighbor upsampled +//! in registers inside the row primitive. use crate::{ ColorMatrix, PixelSink, SourceFormat, @@ -22,10 +22,11 @@ use crate::{ /// Zero‑sized marker for the YUV 4:2:0 **10‑bit** source format. Used /// as the `F` type parameter on [`crate::sinker::MixedSinker`]. 
/// -/// colconv v0.2 ships only the 10‑bit specialization; 12‑ and 14‑bit -/// will arrive as separate markers (`Yuv420p12`, `Yuv420p14`) that -/// refer to the same underlying [`Yuv420pFrame16`] struct with -/// different `BITS` values. +/// 12‑bit and 14‑bit siblings ship as separate markers +/// ([`super::Yuv420p12`] / [`super::Yuv420p14`]) on the same +/// [`Yuv420pFrame16`] struct with different `BITS` values. 16‑bit +/// needs a different kernel family (Q15 chroma_sum overflows i32) and +/// is not yet shipped. #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] pub struct Yuv420p10;