From e5e7ea19c784d2e9105beff01111a782ee412a54 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:31:10 +1200 Subject: [PATCH 1/4] more simd backend --- src/row/arch/neon.rs | 200 +++++++++++++++++++---------------- src/row/arch/wasm_simd128.rs | 56 +++++----- src/row/arch/x86_avx2.rs | 56 +++++----- src/row/arch/x86_avx512.rs | 56 +++++----- src/row/arch/x86_sse41.rs | 64 ++++++----- src/row/mod.rs | 48 ++++----- src/row/scalar.rs | 101 ++++++++++-------- 7 files changed, 318 insertions(+), 263 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index f98d9cd..54e7a07 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -39,8 +39,8 @@ use core::arch::aarch64::{ vget_high_s16, vget_high_u8, vget_high_u16, vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, vld1q_u8, vld1q_u16, vld2_u8, vld2q_u16, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, vmovl_s16, vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, - vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, - vshrq_n_u16, vst1q_u8, vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, + vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshlq_u16, + vshrq_n_s32, vst1q_u8, vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, }; use crate::{ColorMatrix, row::scalar}; @@ -190,7 +190,8 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } -/// NEON YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// NEON high‑bit‑depth YUV 4:2:0 (`BITS` ∈ {10, 12, 14}) → packed +/// **8‑bit** RGB. /// /// Block size is 16 Y pixels / 8 chroma pairs per iteration. 
The /// pipeline mirrors [`yuv_420_to_rgb_row`] byte‑for‑byte; the only @@ -199,16 +200,20 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// (16 lanes of `u8`), so each Y iteration needs two Y loads to /// cover 16 pixels — there's no widening step because the samples /// already live in 16‑bit lanes. -/// - Chroma bias is **512** (10‑bit center) rather than 128. +/// - Chroma bias is `128 << (BITS - 8)` (512 for 10‑bit, 2048 for +/// 12‑bit, 8192 for 14‑bit) rather than 128. /// - Range‑scaling params come from [`scalar::range_params_n`] with -/// `BITS = 10, OUT_BITS = 8`, so `y_scale` / `c_scale` are ~¼ the -/// 8‑bit values (mapping 10‑bit input to 8‑bit output). +/// the matching `BITS` const, so `y_scale` / `c_scale` map the +/// source depth to 8‑bit output in a single Q15 shift. +/// - Each load is AND‑masked to the low `BITS` bits so out‑of‑range +/// samples (e.g. high‑bit‑packed data mistakenly handed to the +/// low‑packed kernel) produce deterministic, backend‑consistent +/// output. /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`] — every -/// Q15 multiply / shift mirrors the scalar path exactly, with the -/// same `(prod + (1 << 14)) >> 15` rounding. +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::`] across +/// all supported bit depths. /// /// # Safety /// @@ -216,9 +221,11 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// 4. `BITS` must be one of `{10, 12, 14}` — the Q15 pipeline +/// overflows i32 at 16 bits; see [`scalar::range_params_n`]. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -234,8 +241,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: NEON availability is the caller's obligation; the @@ -248,7 +255,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); - let mask_v = vdupq_n_u16(scalar::bits_mask::<10>()); + let mask_v = vdupq_n_u16(scalar::bits_mask::()); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); let cgu = vdupq_n_s32(coeffs.g_u()); @@ -259,11 +266,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let mut x = 0usize; while x + 16 <= width { // Two Y loads cover 16 lanes; one U load + one V load cover 8 - // chroma each. Each load is AND‑masked to the low 10 bits so - // out‑of‑range samples (e.g. `p010`‑style packing with the - // 10 active bits in the high 10 of each u16) can never push - // an intermediate past i16 range. For valid input the AND is - // a no‑op (samples already in [0, 1023]). + // chroma each. Each load is AND‑masked to the low BITS bits so + // out‑of‑range samples (e.g. high‑bit‑packed data handed to + // the low‑packed kernel) can never push an intermediate past + // i16 range. For valid input the AND is a no‑op. 
let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); @@ -325,7 +331,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( // Scalar tail — remaining < 16 pixels (always even per 4:2:0). if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -338,24 +344,25 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } } -/// NEON YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB (native depth). +/// NEON high‑bit‑depth YUV 4:2:0 (`BITS` ∈ {10, 12, 14}) → packed +/// **native‑depth `u16`** RGB. /// /// Block size is 16 Y pixels / 8 chroma pairs per iteration. Shares -/// all pre‑write math with [`yuv420p10_to_rgb_row`]; the only +/// all pre‑write math with [`yuv_420p_n_to_rgb_row`]; the only /// difference is the final clamp + write: -/// - Y‑path scale is calibrated for `OUT_BITS = 10` rather than 8, -/// so `y_scaled` lives in `[0, 1023]` before the chroma add. -/// - The `y_scaled + chroma` sum is clamped to `[0, 1023]` with -/// `vmaxq_s16(vminq_s16(_, 1023), 0)` — a simple saturate‑narrow -/// doesn't suffice because the sum can overshoot 1023 (up to ~2046 -/// without saturating at i16 bounds). +/// - Y‑path scale is calibrated for `OUT_BITS = BITS` rather than 8, +/// so `y_scaled` lives in `[0, (1 << BITS) - 1]`. +/// - The `y_scaled + chroma` sum is clamped to `[0, (1 << BITS) - 1]` +/// with `vmaxq_s16(vminq_s16(_, max), 0)` — a simple saturate‑ +/// narrow doesn't suffice because the sum can overshoot the +/// `BITS`-bit max without saturating at i16 bounds. /// - Writes use two `vst3q_u16` calls per iteration — each handles 8 /// pixels × 3 channels = 24 `u16` elements, so two cover 16 pixels. 
/// /// # Numerical contract /// -/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`] — every -/// Q15 multiply / shift / clamp mirrors the scalar reference. +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::`] across +/// supported `BITS` values. /// /// # Safety /// @@ -363,9 +370,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// 4. `BITS` must be one of `{10, 12, 14}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -381,10 +389,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: NEON availability is the caller's obligation; the // dispatcher in `crate::row` verifies it. 
Pointer adds are bounded @@ -396,8 +404,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); - let mask_v = vdupq_n_u16(scalar::bits_mask::<10>()); - let max_v = vdupq_n_s16(OUT_MAX_10); + let mask_v = vdupq_n_u16(scalar::bits_mask::()); + let max_v = vdupq_n_s16(out_max); let zero_v = vdupq_n_s16(0); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); @@ -408,9 +416,9 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - // AND‑mask each load to the low 10 bits so intermediates stay - // within the i16 range the Q15 narrow steps expect — see - // matching comment in [`yuv420p10_to_rgb_row`]. + // AND‑mask each load to the low BITS bits so intermediates + // stay within the i16 range the Q15 narrow steps expect — see + // matching comment in [`yuv_420p_n_to_rgb_row`]. let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); @@ -447,9 +455,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); // Native‑depth output: add Y + chroma in i16, then clamp to - // [0, 1023] explicitly. `vqaddq_s16` saturates at i16 bounds - // (irrelevant here since |sum| < 2047 always), so the subsequent - // max/min clamps to the 10‑bit range. + // [0, (1 << BITS) - 1] explicitly. `vqaddq_s16` saturates at + // i16 bounds (irrelevant here: |sum| stays well inside i16 + // for BITS ≤ 14), so the subsequent max/min clamps to the + // native bit depth. 
let r_lo = clamp_u10(vqaddq_s16(y_scaled_lo, r_dup_lo), zero_v, max_v); let r_hi = clamp_u10(vqaddq_s16(y_scaled_hi, r_dup_hi), zero_v, max_v); let g_lo = clamp_u10(vqaddq_s16(y_scaled_lo, g_dup_lo), zero_v, max_v); @@ -467,7 +476,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -481,33 +490,35 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } /// Clamps an i16x8 vector to `[0, max]` and reinterprets to u16x8. -/// Used by the 10‑bit u16 output path to avoid `vqmovun_s16`'s u8 -/// saturation. +/// Used by the native‑depth u16 output paths to avoid `vqmovun_s16`'s +/// u8 saturation. #[inline(always)] fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_t { unsafe { vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(v, zero_v), max_v)) } } -/// NEON P010 → packed **8‑bit** RGB. +/// NEON high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}: P010, P012) +/// → packed **8‑bit** RGB. /// /// Block size 16 Y pixels / 8 chroma pairs per iteration. Differences -/// from [`yuv420p10_to_rgb_row`]: +/// from [`yuv_420p_n_to_rgb_row`]: /// - UV is semi‑planar interleaved (`U0, V0, U1, V1, …`), split in /// one shot via `vld2q_u16` (returns separate U and V vectors). -/// - Each `u16` load is **shifted right by 6** (`vshrq_n_u16::<6>`) -/// instead of AND‑masked — P010 packs its 10 active bits in the -/// HIGH 10 of each `u16`, so `>> 6` extracts the value and -/// simultaneously clears the low 6 bits (which the format mandates -/// are zero anyway; the shift makes mispacked input deterministic). -/// - Chroma bias is 512 (10‑bit center) after the shift. +/// - Each `u16` load is **right‑shifted by `16 - BITS`** — 6 for +/// P010, 4 for P012 — extracting the `BITS` active bits from the +/// high bits of each `u16` and clearing the low bits. 
The shift +/// runs via `vshlq_u16` with a negative loop‑invariant count so a +/// single kernel serves all supported bit depths. /// /// After the shift, the rest of the pipeline is identical to the -/// `yuv420p10` path — same `chroma_i16x8` / `scale_y` / `chroma_dup` -/// / `vst3q_u8` write, with `range_params_n::<10, 8>` scaling. +/// low‑bit‑packed planar path — same `chroma_i16x8` / `scale_y` / +/// `chroma_dup` / `vst3q_u8` write, with `range_params_n::` +/// scaling. /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] across all +/// supported `BITS` values. /// /// # Safety /// @@ -515,9 +526,10 @@ fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_t { /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// 4. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -531,8 +543,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: NEON availability is the caller's obligation. @@ -542,6 +554,9 @@ pub(crate) unsafe fn p010_to_rgb_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); + // `vshlq_u16` performs right shift when the count is negative. + // Count = -(16 - BITS) extracts the `BITS` active high bits. 
+ let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); let cgu = vdupq_n_s32(coeffs.g_u()); @@ -551,17 +566,16 @@ pub(crate) unsafe fn p010_to_rgb_row( let mut x = 0usize; while x + 16 <= width { - // 16 Y pixels in two u16x8 loads, shifted right by 6 to extract - // the 10‑bit values from P010's high‑bit packing. - let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); - let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + // 16 Y pixels in two u16x8 loads, right-shifted by 16-BITS to + // extract the active bits from the high-bit packing. + let y_vec_lo = vshlq_u16(vld1q_u16(y.as_ptr().add(x)), shr_count); + let y_vec_hi = vshlq_u16(vld1q_u16(y.as_ptr().add(x + 8)), shr_count); // Semi‑planar UV: `vld2q_u16` loads 16 interleaved `u16` elements - // and returns (evens, odds) = (U, V) in one shot. Each gets the - // same `>> 6` shift as Y. + // and returns (evens, odds) = (U, V) in one shot. let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); - let u_vec = vshrq_n_u16::<6>(uv_pair.0); - let v_vec = vshrq_n_u16::<6>(uv_pair.1); + let u_vec = vshlq_u16(uv_pair.0, shr_count); + let v_vec = vshlq_u16(uv_pair.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -613,7 +627,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -625,8 +639,9 @@ pub(crate) unsafe fn p010_to_rgb_row( } } -/// NEON P010 → packed **10‑bit `u16`** RGB (native‑depth, low‑bit‑ -/// packed output — `yuv420p10le` convention, not P010). +/// NEON high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, +/// `yuv420p10le` / `yuv420p12le` convention — not P010/P012). 
/// /// Same structure as [`p010_to_rgb_row`] up to the chroma compute; /// the only differences are: @@ -638,7 +653,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -648,7 +663,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -662,10 +677,10 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: NEON availability is the caller's obligation. 
unsafe { @@ -674,7 +689,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); - let max_v = vdupq_n_s16(OUT_MAX_10); + let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); + let max_v = vdupq_n_s16(out_max); let zero_v = vdupq_n_s16(0); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); @@ -685,11 +701,11 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); - let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + let y_vec_lo = vshlq_u16(vld1q_u16(y.as_ptr().add(x)), shr_count); + let y_vec_hi = vshlq_u16(vld1q_u16(y.as_ptr().add(x + 8)), shr_count); let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); - let u_vec = vshrq_n_u16::<6>(uv_pair.0); - let v_vec = vshrq_n_u16::<6>(uv_pair.1); + let u_vec = vshlq_u16(uv_pair.0, shr_count); + let v_vec = vshlq_u16(uv_pair.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -737,7 +753,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1705,7 +1721,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { @@ -1730,7 +1746,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { 
@@ -1851,7 +1867,7 @@ mod tests { full_range, ); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -1870,7 +1886,7 @@ mod tests { full_range, ); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); } assert_eq!( rgb16_scalar, rgb16_neon, @@ -1913,9 +1929,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { let diff = rgb_scalar @@ -1938,9 +1954,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { let diff = rgb_scalar @@ -2036,9 +2052,9 @@ mod tests { for full_range in [true, false] { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_neon, 
width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -2047,9 +2063,9 @@ mod tests { let mut rgb16_scalar = std::vec![0u16; width * 3]; let mut rgb16_neon = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb16_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb16_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb16_neon, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb16_neon, width, matrix, full_range); } assert_eq!( rgb16_scalar, rgb16_neon, diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 21efa7c..35f068e 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -214,7 +214,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -230,8 +230,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: simd128 compile‑time availability is the caller's @@ -242,7 +242,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = i32x4_splat(y_scale); let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); - let mask_v = u16x8_splat(scalar::bits_mask::<10>()); + let mask_v = u16x8_splat(scalar::bits_mask::()); let cru = i32x4_splat(coeffs.r_u()); let crv = i32x4_splat(coeffs.r_v()); let cgu = i32x4_splat(coeffs.g_u()); @@ -303,7 +303,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if 
x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -334,7 +334,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -350,8 +350,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -363,7 +363,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = i32x4_splat(y_scale); let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); - let mask_v = u16x8_splat(scalar::bits_mask::<10>()); + let mask_v = u16x8_splat(scalar::bits_mask::()); let max_v = i16x8_splat(OUT_MAX_10); let zero_v = i16x8_splat(0); let cru = i32x4_splat(coeffs.r_u()); @@ -424,7 +424,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -511,7 +511,7 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. /// /// # Safety /// @@ -521,7 +521,7 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -535,8 +535,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: simd128 compile‑time availability is the caller's @@ -606,7 +606,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -623,7 +623,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -633,7 +633,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -647,8 +647,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -719,7 +719,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1609,7 +1609,7 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1634,7 +1634,15 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1724,7 +1732,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); 
unsafe { p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } @@ -1738,7 +1746,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index e5b6db7..a0f0e8d 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -234,7 +234,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -250,8 +250,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX2 availability is the caller's obligation. 
@@ -261,7 +261,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); - let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::() as i16); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); let cgu = _mm256_set1_epi32(coeffs.g_u()); @@ -338,7 +338,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -375,7 +375,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -391,8 +391,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -403,7 +403,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); - let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::() as i16); let max_v = _mm256_set1_epi16(OUT_MAX_10); let zero_v = _mm256_set1_epi16(0); let cru = _mm256_set1_epi32(coeffs.r_u()); @@ -505,7 +505,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - 
scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -538,7 +538,7 @@ fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. /// /// # Safety /// @@ -548,7 +548,7 @@ fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -562,8 +562,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX2 availability is the caller's obligation. @@ -644,7 +644,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -661,7 +661,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -671,7 +671,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -685,8 +685,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -787,7 +787,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1608,7 +1608,7 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1636,7 +1636,15 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1729,7 +1737,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); 
unsafe { p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } @@ -1746,7 +1754,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 19caeaa..8e4ece2 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -251,7 +251,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -267,8 +267,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX‑512BW availability is the caller's obligation. 
@@ -278,7 +278,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); - let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm512_set1_epi16(scalar::bits_mask::() as i16); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); let cgu = _mm512_set1_epi32(coeffs.g_u()); @@ -358,7 +358,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -391,7 +391,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -407,8 +407,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -419,7 +419,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); - let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm512_set1_epi16(scalar::bits_mask::() as i16); let max_v = _mm512_set1_epi16(OUT_MAX_10); let zero_v = _mm512_set1_epi16(0); let cru = _mm512_set1_epi32(coeffs.r_u()); @@ -508,7 +508,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - 
scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -585,7 +585,7 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. /// /// # Safety /// @@ -595,7 +595,7 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -609,8 +609,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX‑512BW availability is the caller's obligation. @@ -692,7 +692,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -709,7 +709,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -719,7 +719,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -733,8 +733,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -823,7 +823,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1645,7 +1645,7 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1673,7 +1673,15 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1766,7 +1774,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, 
full_range); unsafe { p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } @@ -1783,7 +1791,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 66e385b..b91d98e 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -37,10 +37,10 @@ use core::arch::x86_64::{ __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_and_si128, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, - _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, _mm_mullo_epi32, _mm_packs_epi32, - _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, - _mm_srai_epi32, _mm_srli_epi16, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, - _mm_unpackhi_epi64, _mm_unpacklo_epi16, _mm_unpacklo_epi64, + _mm_cvtsi32_si128, _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, + _mm_mullo_epi32, _mm_packs_epi32, _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, + _mm_setr_epi8, _mm_shuffle_epi8, _mm_srai_epi32, _mm_srl_epi16, _mm_srli_si128, _mm_sub_epi16, + _mm_unpackhi_epi16, _mm_unpackhi_epi64, _mm_unpacklo_epi16, _mm_unpacklo_epi64, }; use crate::{ @@ -206,7 +206,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. /// /// # Safety /// @@ -216,7 +216,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -230,8 +230,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: SSE4.1 availability is the caller's obligation. @@ -304,7 +304,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -321,7 +321,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. /// /// # Safety /// @@ -331,7 +331,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -345,8 +345,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -415,7 +415,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -486,7 +486,7 @@ unsafe fn deinterleave_uv_u16(ptr: *const u16) -> (__m128i, __m128i) { /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -502,8 +502,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: SSE4.1 availability is the caller's obligation; the @@ -516,7 +516,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::() as i16); let cru = 
_mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -579,7 +579,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -614,7 +614,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -630,8 +630,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; const OUT_MAX_10: i16 = 1023; @@ -642,7 +642,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::() as i16); let max_v = _mm_set1_epi16(OUT_MAX_10); let zero_v = _mm_set1_epi16(0); let cru = _mm_set1_epi32(coeffs.r_u()); @@ -708,7 +708,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -1387,7 +1387,7 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + 
scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1415,7 +1415,15 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } @@ -1508,7 +1516,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } @@ -1525,7 +1533,7 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); } diff --git a/src/row/mod.rs b/src/row/mod.rs index 80afab7..f6257fc 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -350,7 +350,7 @@ pub fn yuv420p10_to_rgb_row( // SAFETY: NEON verified on this CPU; bounds / parity are // the caller's obligation (asserted above). 
unsafe { - arch::neon::yuv420p10_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -359,7 +359,7 @@ pub fn yuv420p10_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv420p10_to_rgb_row( + arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -368,7 +368,7 @@ pub fn yuv420p10_to_rgb_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv420p10_to_rgb_row( + arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -377,7 +377,7 @@ pub fn yuv420p10_to_rgb_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv420p10_to_rgb_row( + arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -388,7 +388,7 @@ pub fn yuv420p10_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv420p10_to_rgb_row( + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -444,7 +444,7 @@ pub fn yuv420p10_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv420p10_to_rgb_u16_row( + arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -455,7 +455,7 @@ pub fn yuv420p10_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv420p10_to_rgb_u16_row( + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -464,7 +464,7 @@ pub fn yuv420p10_to_rgb_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv420p10_to_rgb_u16_row( + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -473,7 +473,7 @@ pub fn yuv420p10_to_rgb_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv420p10_to_rgb_u16_row( + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -484,7 +484,7 @@ pub fn yuv420p10_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv420p10_to_rgb_u16_row( + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -504,7 +504,7 @@ pub fn yuv420p10_to_rgb_u16_row( /// /// This is the HDR hardware‑decode keystone format: VideoToolbox, /// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit -/// output. See `scalar::p010_to_rgb_row` for the full semantic +/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic /// specification. `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -529,7 +529,7 @@ pub fn p010_to_rgb_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -538,21 +538,21 @@ pub fn p010_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -561,7 +561,7 @@ pub fn p010_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -570,7 +570,7 @@ pub fn p010_to_rgb_row( } } - scalar::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P010** to **native‑depth `u16`** packed RGB @@ -579,7 +579,7 @@ pub fn p010_to_rgb_row( /// Callers feeding this output into a P010 consumer must shift left /// by 6. /// -/// See `scalar::p010_to_rgb_u16_row` for the full spec. +/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. /// `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -604,7 +604,7 @@ pub fn p010_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -613,21 +613,21 @@ pub fn p010_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. 
unsafe { - arch::x86_avx512::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -636,7 +636,7 @@ pub fn p010_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p010_to_rgb_u16_row( + arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( y, uv_half, rgb_out, width, matrix, full_range, ); } @@ -647,7 +647,7 @@ pub fn p010_to_rgb_u16_row( } } - scalar::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 527ea4d..26759c9 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -347,16 +347,17 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( // ---- P010 (semi-planar 10-bit, high-bit-packed) → RGB ------------------ /// Converts one row of P010 (semi‑planar 4:2:0 with UV interleaved, -/// 10 active bits in the **high** 10 of each `u16`) to **8‑bit** -/// packed RGB. +/// `BITS` active bits in the **high** `BITS` of each `u16`) to +/// **8‑bit** packed RGB. 
/// /// Structurally identical to [`nv12_to_rgb_row`] plus the per‑sample -/// shift: each `u16` load is extracted to its 10‑bit value via -/// `sample >> 6`, then the same Q15 pipeline as -/// [`yuv_420p_n_to_rgb_row`] runs with `BITS == 10`. Mispacked input -/// — e.g. a `yuv420p10le` buffer with values in the **low** 10 bits -/// — is masked down to a small positive number (producing near‑black -/// output) rather than silent garbage, matching every SIMD backend. +/// shift: each `u16` load is extracted to its `BITS`‑bit value via +/// `sample >> (16 - BITS)`, then the same Q15 pipeline as +/// [`yuv_420p_n_to_rgb_row`] runs with the same `BITS`. For `BITS == +/// 10` this is P010 (`>> 6`); for `BITS == 12` it's P012 (`>> 4`). +/// Mispacked input — e.g. a low‑bit‑packed buffer handed to this +/// kernel — has its active low bits discarded (producing near‑black +/// output), matching every SIMD backend. /// /// # Panics (debug builds) /// @@ -364,7 +365,7 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p010_to_rgb_row( +pub(crate) fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -372,26 +373,28 @@ pub(crate) fn p010_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - debug_assert_eq!(width & 1, 0, "P010 requires even width"); + debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); let coeffs = Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = range_params_n::<10, 8>(full_range); - let bias = chroma_bias::<10>(); - - // Each `u16` load is converted to its 10-bit sample with `>> 6`, - // extracting the upper 10 bits and leaving the result in - // `[0, 1023]`. 
If low-packed input (`yuv420p10le`) is handed to - // this kernel by mistake, that shift discards the active low 6 bits - // rather than recovering the intended 10-bit value. No hot-path - // cost: one shift per load. + let (y_off, y_scale, c_scale) = range_params_n::(full_range); + let bias = chroma_bias::(); + let shift = 16 - BITS; + + // Each `u16` load is converted to its `BITS`-bit sample with + // `>> (16 - BITS)` — 6 for P010, 4 for P012. Extracts the upper + // bits and leaves the result in `[0, (1 << BITS) - 1]`. If + // low-packed input (`yuv420p10le`, `yuv420p12le`) is handed to + // this kernel by mistake, the shift discards the active low bits + // rather than recovering the intended value. No hot-path cost: + // one shift per load. let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2] >> 6; - let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_sample = uv_half[c_idx * 2] >> shift; + let v_sample = uv_half[c_idx * 2 + 1] >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -399,12 +402,12 @@ pub(crate) fn p010_to_rgb_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); rgb_out[x * 3] = clamp_u8(y0 + r_chroma); rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); - let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); @@ -413,15 +416,18 @@ pub(crate) fn p010_to_rgb_row( } } -/// Converts one row of P010 to **native‑depth `u16`** packed RGB -/// (10 active bits in the low bits of each `u16`, 
matching -/// `yuv420p10le` convention — **not** P010's high‑bit packing). +/// Converts one row of high‑bit‑packed semi‑planar 4:2:0 +/// (`BITS` ∈ {10, 12}: P010, P012) to **native‑depth `u16`** +/// packed RGB — samples are **low‑bit‑packed** on output +/// (`[0, (1 << BITS) - 1]` in the low bits of each `u16`, upper bits +/// zero), matching the `yuv420p10le` / `yuv420p12le` convention — +/// **not** the P010/P012 high‑bit packing. Callers feeding a P010/ +/// P012 consumer must shift the output left by `16 - BITS`. /// -/// Mirrors [`yuv_420p_n_to_rgb_u16_row::<10>`] on the math side; the -/// only difference is the input shift (`sample >> 6` instead of -/// `sample & 0x3FF`) and the UV deinterleave. Output is suitable for -/// direct consumption by downstream `yuv420p10le`‑shaped tooling. If -/// you need P010‑packed RGB output, shift left by 6 on the caller. +/// Mirrors [`yuv_420p_n_to_rgb_u16_row`] on the math side; the only +/// differences are the input shift (`sample >> (16 - BITS)` to +/// extract the `BITS`-bit value from the high-bit packing) and the +/// interleaved UV layout. /// /// # Panics (debug builds) /// @@ -429,7 +435,7 @@ pub(crate) fn p010_to_rgb_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p010_to_rgb_u16_row( +pub(crate) fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -437,21 +443,22 @@ pub(crate) fn p010_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - debug_assert_eq!(width & 1, 0, "P010 requires even width"); + debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); let coeffs = Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = range_params_n::<10, 10>(full_range); - let bias = chroma_bias::<10>(); - let out_max: i32 = (1i32 << 10) - 1; + let (y_off, y_scale, c_scale) = range_params_n::(full_range); + let bias = chroma_bias::(); + let out_max: i32 = (1i32 << BITS) - 1; + let shift = 16 - BITS; let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2] >> 6; - let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_sample = uv_half[c_idx * 2] >> shift; + let v_sample = uv_half[c_idx * 2 + 1] >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -459,12 +466,12 @@ pub(crate) fn p010_to_rgb_u16_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; - let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; rgb_out[(x + 1) * 3 + 1] = (y1 
+ g_chroma).clamp(0, out_max) as u16; rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; @@ -1131,7 +1138,7 @@ mod tests { let y = [0u16; 4]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; // U0 V0 U1 V1 let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); } @@ -1141,7 +1148,7 @@ mod tests { let y = [0xFFC0u16; 4]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); } @@ -1151,7 +1158,7 @@ mod tests { let y = [0x8000u16; 4]; let uv = [0x8000u16; 4]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); assert_eq!(r, g); @@ -1167,7 +1174,7 @@ mod tests { let y = [0x1000u16, 0x1000, 0xEB00, 0xEB00]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); @@ -1196,7 +1203,7 @@ mod tests { ColorMatrix::Bt709, true, ); - p010_to_rgb_row( + p_n_to_rgb_row::<10>( &y_p010, &uv_p010, &mut rgb_p010, @@ -1214,7 +1221,7 @@ mod tests { let y = [0xFFC0u16; 4]; let uv = [0x8000u16; 4]; let mut rgb = [0u16; 12]; - p010_to_rgb_u16_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 1023), "got 
{rgb:?}"); } @@ -1223,7 +1230,7 @@ mod tests { let y = [0x1000u16, 0xEB00]; let uv = [0x8000u16, 0x8000]; let mut rgb = [0u16; 6]; - p010_to_rgb_u16_row(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); } From 24c900f03381cffdfa1ea9ae90098974dafbb968 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:45:29 +1200 Subject: [PATCH 2/4] more simd backend --- src/row/arch/wasm_simd128.rs | 54 ++++++++++++++-------------- src/row/arch/x86_avx2.rs | 69 ++++++++++++++++++------------------ src/row/arch/x86_avx512.rs | 62 ++++++++++++++++---------------- src/row/arch/x86_sse41.rs | 57 +++++++++++++++-------------- 4 files changed, 118 insertions(+), 124 deletions(-) diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 35f068e..9a887e1 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -353,7 +353,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: simd128 compile‑time availability is the caller's // obligation. @@ -364,7 +364,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); let mask_v = u16x8_splat(scalar::bits_mask::()); - let max_v = i16x8_splat(OUT_MAX_10); + let max_v = i16x8_splat(out_max); let zero_v = i16x8_splat(0); let cru = i32x4_splat(coeffs.r_u()); let crv = i32x4_splat(coeffs.r_v()); @@ -554,13 +554,16 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr = (16 - BITS) as u32; + let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6); - let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6); + let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), shr); + let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), shr); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x)); - let u_vec = u16x8_shr(u_vec, 6); - let v_vec = u16x8_shr(v_vec, 6); + let u_vec = u16x8_shr(u_vec, shr); + let v_vec = u16x8_shr(v_vec, shr); let u_i16 = i16x8_sub(u_vec, bias_v); let v_i16 = i16x8_sub(v_vec, bias_v); @@ -650,7 +653,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: simd128 compile‑time availability is the caller's // obligation. @@ -660,7 +663,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let y_scale_v = i32x4_splat(y_scale); let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); - let max_v = i16x8_splat(OUT_MAX_10); + let max_v = i16x8_splat(out_max); let zero_v = i16x8_splat(0); let cru = i32x4_splat(coeffs.r_u()); let crv = i32x4_splat(coeffs.r_v()); @@ -669,13 +672,16 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr = (16 - BITS) as u32; + let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6); - let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6); + let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), shr); + let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), shr); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x)); - let u_vec = u16x8_shr(u_vec, 6); - let v_vec = u16x8_shr(v_vec, 6); + let u_vec = u16x8_shr(u_vec, shr); + let v_vec = u16x8_shr(v_vec, shr); let u_i16 = i16x8_sub(u_vec, bias_v); let v_i16 = i16x8_sub(v_vec, bias_v); @@ -1609,9 +1615,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1634,17 +1640,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::( - &y, - &u, - &v, - &mut rgb_scalar, - width, - matrix, - full_range, - ); + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1732,9 +1730,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + 
scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u8 diverges"); } @@ -1746,9 +1744,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u16 diverges"); } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index a0f0e8d..c4e5a12 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -39,12 +39,13 @@ //! element order. Every fixup is called out inline. 
use core::arch::x86_64::{ - __m256i, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, _mm256_and_si256, - _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256, - _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, _mm256_mullo_epi32, _mm256_packs_epi32, - _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set1_epi16, - _mm256_set1_epi32, _mm256_setr_epi8, _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_srli_epi16, - _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpacklo_epi16, + __m256i, _mm_cvtsi32_si128, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, + _mm256_and_si256, _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, + _mm256_extracti128_si256, _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, + _mm256_mullo_epi32, _mm256_packs_epi32, _mm256_packus_epi16, _mm256_permute2x128_si256, + _mm256_permute4x64_epi64, _mm256_set1_epi16, _mm256_set1_epi32, _mm256_setr_epi8, + _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_srl_epi16, _mm256_sub_epi16, + _mm256_unpackhi_epi16, _mm256_unpacklo_epi16, }; use crate::{ @@ -394,7 +395,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX2 availability is the caller's obligation. 
unsafe { @@ -404,7 +405,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); let mask_v = _mm256_set1_epi16(scalar::bits_mask::() as i16); - let max_v = _mm256_set1_epi16(OUT_MAX_10); + let max_v = _mm256_set1_epi16(out_max); let zero_v = _mm256_set1_epi16(0); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); @@ -573,6 +574,8 @@ pub(crate) unsafe fn p_n_to_rgb_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); + // High-bit-packed samples: shift right by `16 - BITS`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); let cgu = _mm256_set1_epi32(coeffs.g_u()); @@ -582,14 +585,15 @@ pub(crate) unsafe fn p_n_to_rgb_row( let mut x = 0usize; while x + 32 <= width { - // 32 Y = two u16×16 loads, shifted right by 6. - let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast())); + // 32 Y = two u16×16 loads, shifted right by `16 - BITS`. + let y_low_i16 = _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), shr_count); // 32 UV (16 pairs) — deinterleave + shift. 
let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); - let u_vec = _mm256_srli_epi16::<6>(u_vec); - let v_vec = _mm256_srli_epi16::<6>(v_vec); + let u_vec = _mm256_srl_epi16(u_vec, shr_count); + let v_vec = _mm256_srl_epi16(v_vec, shr_count); let u_i16 = _mm256_sub_epi16(u_vec, bias_v); let v_i16 = _mm256_sub_epi16(v_vec, bias_v); @@ -688,7 +692,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX2 availability is the caller's obligation. unsafe { @@ -697,8 +701,10 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); - let max_v = _mm256_set1_epi16(OUT_MAX_10); + let max_v = _mm256_set1_epi16(out_max); let zero_v = _mm256_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); let cgu = _mm256_set1_epi32(coeffs.g_u()); @@ -708,11 +714,12 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { - let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast())); + let y_low_i16 = _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); - let u_vec = _mm256_srli_epi16::<6>(u_vec); - let v_vec = _mm256_srli_epi16::<6>(v_vec); + let u_vec = _mm256_srl_epi16(u_vec, shr_count); + let v_vec = _mm256_srl_epi16(v_vec, shr_count); let u_i16 = _mm256_sub_epi16(u_vec, bias_v); let v_i16 = _mm256_sub_epi16(v_vec, bias_v); @@ -1608,9 +1615,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1636,17 +1643,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::( - &y, - &u, - &v, - &mut rgb_scalar, - width, - matrix, - full_range, - ); + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + 
yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1737,9 +1736,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u8 diverges"); } @@ -1754,9 +1753,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u16 diverges"); } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 8e4ece2..6b0dbe9 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -53,13 +53,13 @@ //! two 32‑Y‑block‑aligned vectors from unpacklo + unpackhi. 
use core::arch::x86_64::{ - __m128i, __m512i, _mm_setr_epi8, _mm256_loadu_si256, _mm512_add_epi32, _mm512_adds_epi16, - _mm512_and_si512, _mm512_broadcast_i32x4, _mm512_castsi512_si128, _mm512_castsi512_si256, - _mm512_cvtepi16_epi32, _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, + __m128i, __m512i, _mm_cvtsi32_si128, _mm_setr_epi8, _mm256_loadu_si256, _mm512_add_epi32, + _mm512_adds_epi16, _mm512_and_si512, _mm512_broadcast_i32x4, _mm512_castsi512_si128, + _mm512_castsi512_si256, _mm512_cvtepi16_epi32, _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, _mm512_extracti64x4_epi64, _mm512_loadu_si512, _mm512_max_epi16, _mm512_min_epi16, _mm512_mullo_epi32, _mm512_packs_epi32, _mm512_packus_epi16, _mm512_permutex2var_epi64, _mm512_permutexvar_epi64, _mm512_set1_epi16, _mm512_set1_epi32, _mm512_setr_epi64, - _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_srli_epi16, _mm512_sub_epi16, + _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_srl_epi16, _mm512_sub_epi16, _mm512_unpackhi_epi16, _mm512_unpacklo_epi16, }; @@ -410,7 +410,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX‑512BW availability is the caller's obligation. 
unsafe { @@ -420,7 +420,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); let mask_v = _mm512_set1_epi16(scalar::bits_mask::() as i16); - let max_v = _mm512_set1_epi16(OUT_MAX_10); + let max_v = _mm512_set1_epi16(out_max); let zero_v = _mm512_set1_epi16(0); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); @@ -620,6 +620,8 @@ pub(crate) unsafe fn p_n_to_rgb_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); + // High-bit-packed samples: shift right by `16 - BITS`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); let cgu = _mm512_set1_epi32(coeffs.g_u()); @@ -633,11 +635,12 @@ pub(crate) unsafe fn p_n_to_rgb_row( let mut x = 0usize; while x + 64 <= width { - let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let y_low_i16 = _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); - let u_vec = _mm512_srli_epi16::<6>(u_vec); - let v_vec = _mm512_srli_epi16::<6>(v_vec); + let u_vec = _mm512_srl_epi16(u_vec, shr_count); + let v_vec = _mm512_srl_epi16(v_vec, shr_count); let u_i16 = _mm512_sub_epi16(u_vec, bias_v); let v_i16 = _mm512_sub_epi16(v_vec, bias_v); @@ -736,7 +739,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX‑512BW 
availability is the caller's obligation. unsafe { @@ -745,8 +748,10 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); - let max_v = _mm512_set1_epi16(OUT_MAX_10); + let max_v = _mm512_set1_epi16(out_max); let zero_v = _mm512_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); let cgu = _mm512_set1_epi32(coeffs.g_u()); @@ -760,11 +765,12 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let mut x = 0usize; while x + 64 <= width { - let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let y_low_i16 = _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); - let u_vec = _mm512_srli_epi16::<6>(u_vec); - let v_vec = _mm512_srli_epi16::<6>(v_vec); + let u_vec = _mm512_srl_epi16(u_vec, shr_count); + let v_vec = _mm512_srl_epi16(v_vec, shr_count); let u_i16 = _mm512_sub_epi16(u_vec, bias_v); let v_i16 = _mm512_sub_epi16(v_vec, bias_v); @@ -1645,9 +1651,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1673,17 +1679,9 @@ mod tests { 
let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::( - &y, - &u, - &v, - &mut rgb_scalar, - width, - matrix, - full_range, - ); + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1774,9 +1772,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u8 diverges"); } @@ -1791,9 +1789,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u16 diverges"); } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index b91d98e..b81a060 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -241,6 +241,10 @@ pub(crate) unsafe fn p_n_to_rgb_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); + // 
High-bit-packed samples: shift right by `16 - BITS` to extract + // the BITS-bit value. Loop-invariant, loaded once into the low 64b + // of `shr_count` for `_mm_srl_epi16`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -250,15 +254,15 @@ pub(crate) unsafe fn p_n_to_rgb_row( let mut x = 0usize; while x + 16 <= width { - // Y: two u16×8 loads, each shifted right by 6. - let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + // Y: two u16×8 loads, each shifted right by `16 - BITS`. + let y_low_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), shr_count); // UV: two u16×8 loads of interleaved [U0,V0,U1,V1,...], then // deinterleave into separate u_vec + v_vec. let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); - let u_vec = _mm_srli_epi16::<6>(u_vec); - let v_vec = _mm_srli_epi16::<6>(v_vec); + let u_vec = _mm_srl_epi16(u_vec, shr_count); + let v_vec = _mm_srl_epi16(v_vec, shr_count); let u_i16 = _mm_sub_epi16(u_vec, bias_v); let v_i16 = _mm_sub_epi16(v_vec, bias_v); @@ -348,7 +352,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: SSE4.1 availability is the caller's obligation. 
unsafe { @@ -357,8 +361,11 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let max_v = _mm_set1_epi16(OUT_MAX_10); + let max_v = _mm_set1_epi16(out_max); let zero_v = _mm_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS` to extract + // the BITS-bit value. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -368,11 +375,11 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + let y_low_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); - let u_vec = _mm_srli_epi16::<6>(u_vec); - let v_vec = _mm_srli_epi16::<6>(v_vec); + let u_vec = _mm_srl_epi16(u_vec, shr_count); + let v_vec = _mm_srl_epi16(v_vec, shr_count); let u_i16 = _mm_sub_epi16(u_vec, bias_v); let v_i16 = _mm_sub_epi16(v_vec, bias_v); @@ -633,7 +640,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: SSE4.1 availability is the caller's obligation. 
unsafe { @@ -643,7 +650,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); let mask_v = _mm_set1_epi16(scalar::bits_mask::() as i16); - let max_v = _mm_set1_epi16(OUT_MAX_10); + let max_v = _mm_set1_epi16(out_max); let zero_v = _mm_set1_epi16(0); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); @@ -1387,9 +1394,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1415,17 +1422,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::( - &y, - &u, - &v, - &mut rgb_scalar, - width, - matrix, - full_range, - ); + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1516,9 +1515,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, 
rgb_simd, "SSE4.1 P010→u8 diverges"); } @@ -1533,9 +1532,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u16 diverges"); } From d917d8effb3eb5a3877231eda9c76fc8a59d2a7a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 19 Apr 2026 23:15:09 +1200 Subject: [PATCH 3/4] more simd backend --- Cargo.toml | 12 + benches/p012_to_rgb.rs | 94 ++++++ benches/yuv_420p12_to_rgb.rs | 100 ++++++ benches/yuv_420p14_to_rgb.rs | 100 ++++++ src/frame.rs | 152 ++++++--- src/row/arch/neon.rs | 152 +++++++++ src/row/arch/wasm_simd128.rs | 155 ++++++++++ src/row/arch/x86_avx2.rs | 164 ++++++++++ src/row/arch/x86_avx512.rs | 167 ++++++++++ src/row/arch/x86_sse41.rs | 176 +++++++++++ src/row/mod.rs | 445 +++++++++++++++++++++++++++ src/sinker/mixed.rs | 579 ++++++++++++++++++++++++++++++++++- src/yuv/mod.rs | 13 + src/yuv/p012.rs | 152 +++++++++ src/yuv/yuv420p12.rs | 161 ++++++++++ src/yuv/yuv420p14.rs | 159 ++++++++++ 16 files changed, 2730 insertions(+), 51 deletions(-) create mode 100644 benches/p012_to_rgb.rs create mode 100644 benches/yuv_420p12_to_rgb.rs create mode 100644 benches/yuv_420p14_to_rgb.rs create mode 100644 src/yuv/p012.rs create mode 100644 src/yuv/yuv420p12.rs create mode 100644 src/yuv/yuv420p14.rs diff --git a/Cargo.toml b/Cargo.toml index 458d138..4c98087 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,10 +32,22 @@ harness = false name = "yuv_420p10_to_rgb" harness = false +[[bench]] +name = "yuv_420p12_to_rgb" +harness = false + +[[bench]] +name = 
"yuv_420p14_to_rgb" +harness = false + [[bench]] name = "p010_to_rgb" harness = false +[[bench]] +name = "p012_to_rgb" +harness = false + [[bench]] name = "rgb_to_hsv" harness = false diff --git a/benches/p012_to_rgb.rs b/benches/p012_to_rgb.rs new file mode 100644 index 0000000..9443f6f --- /dev/null +++ b/benches/p012_to_rgb.rs @@ -0,0 +1,94 @@ +//! Per‑row P012 (semi‑planar 4:2:0, 12‑bit, high‑bit‑packed) → RGB +//! throughput baseline. +//! +//! Mirrors [`p010_to_rgb`] but feeds 12‑bit high‑bit‑packed samples +//! (12 active bits in the high 12 of each `u16`, low 4 bits zero). + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{p012_to_rgb_row, p012_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic P012‑packed pseudo‑random +/// sequence — 12‑bit values shifted into the high 12 bits of each +/// `u16` (low 4 bits zero), matching the real P012 storage layout. +fn fill_pseudo_random_p012(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = (((state >> 8) & 0xFFF) as u16) << 4; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("p012_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + // UV row payload is `width` u16 elements (w / 2 interleaved pairs). 
+ let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p012(&mut y, 0x1111); + fill_pseudo_random_p012(&mut uv, 0x2222); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p012_to_rgb_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("p012_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p012(&mut y, 0x1111); + fill_pseudo_random_p012(&mut uv, 0x2222); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p012_to_rgb_u16_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/benches/yuv_420p12_to_rgb.rs b/benches/yuv_420p12_to_rgb.rs new file mode 100644 index 0000000..cba3e28 --- /dev/null +++ b/benches/yuv_420p12_to_rgb.rs @@ -0,0 +1,100 @@ +//! Per‑row YUV 4:2:0 12‑bit → packed RGB throughput baseline. +//! +//! Mirrors [`yuv_420p10_to_rgb`] but feeds 12‑bit low‑bit‑packed +//! samples (values ≤ 4095). Same `u8_*` / `u16_*` split per width so +//! scalar vs SIMD speedup is a two‑line comparison in the Criterion +//! report. 
+ +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic 12‑bit pseudo‑random +/// sequence — values occupy the low 12 bits of each `u16`, matching +/// the storage layout of `yuv420p12le`. +fn fill_pseudo_random_u16(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = ((state >> 8) & 0xFFF) as u16; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("yuv420p12_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p12_to_rgb_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("yuv420p12_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); 
+ + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p12_to_rgb_u16_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/benches/yuv_420p14_to_rgb.rs b/benches/yuv_420p14_to_rgb.rs new file mode 100644 index 0000000..ac6e5ee --- /dev/null +++ b/benches/yuv_420p14_to_rgb.rs @@ -0,0 +1,100 @@ +//! Per‑row YUV 4:2:0 14‑bit → packed RGB throughput baseline. +//! +//! Mirrors [`yuv_420p10_to_rgb`] but feeds 14‑bit low‑bit‑packed +//! samples (values ≤ 16383). Same `u8_*` / `u16_*` split per width so +//! scalar vs SIMD speedup is a two‑line comparison in the Criterion +//! report. + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic 14‑bit pseudo‑random +/// sequence — values occupy the low 14 bits of each `u16`, matching +/// the storage layout of `yuv420p14le`. 
+fn fill_pseudo_random_u16(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = ((state >> 8) & 0x3FFF) as u16; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("yuv420p14_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p14_to_rgb_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("yuv420p14_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p14_to_rgb_u16_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + 
+criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/src/frame.rs b/src/frame.rs index 8becc63..2c1997b 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -519,7 +519,7 @@ pub enum Nv12FrameError { /// — wrong colors, but consistently wrong across scalar + every /// SIMD backend, which is visible in any output diff. #[derive(Debug, Clone, Copy)] -pub struct P010Frame<'a> { +pub struct PnFrame<'a, const BITS: u32> { y: &'a [u16], uv: &'a [u16], width: u32, @@ -528,7 +528,7 @@ pub struct P010Frame<'a> { uv_stride: u32, } -impl<'a> P010Frame<'a> { +impl<'a, const BITS: u32> PnFrame<'a, BITS> { /// Constructs a new [`P010Frame`], validating dimensions and plane /// lengths. Strides are in `u16` **samples**. /// @@ -548,19 +548,26 @@ impl<'a> P010Frame<'a> { height: u32, y_stride: u32, uv_stride: u32, - ) -> Result { + ) -> Result { + // Guard the `BITS` parameter at the top — 10 and 12 are the only + // high-bit-packed depths supported by the Q15 kernel family. 14 + // exists in the planar `yuv420p14le` family but not as a Pn + // hardware output; 16 would need i64 intermediates. 
+ if BITS != 10 && BITS != 12 { + return Err(PnFrameError::UnsupportedBits { bits: BITS }); + } if width == 0 || height == 0 { - return Err(P010FrameError::ZeroDimension { width, height }); + return Err(PnFrameError::ZeroDimension { width, height }); } if width & 1 != 0 { - return Err(P010FrameError::OddWidth { width }); + return Err(PnFrameError::OddWidth { width }); } if y_stride < width { - return Err(P010FrameError::YStrideTooSmall { width, y_stride }); + return Err(PnFrameError::YStrideTooSmall { width, y_stride }); } let uv_row_elems = width; if uv_stride < uv_row_elems { - return Err(P010FrameError::UvStrideTooSmall { + return Err(PnFrameError::UvStrideTooSmall { uv_row_elems, uv_stride, }); @@ -569,14 +576,14 @@ impl<'a> P010Frame<'a> { let y_min = match (y_stride as usize).checked_mul(height as usize) { Some(v) => v, None => { - return Err(P010FrameError::GeometryOverflow { + return Err(PnFrameError::GeometryOverflow { stride: y_stride, rows: height, }); } }; if y.len() < y_min { - return Err(P010FrameError::YPlaneTooShort { + return Err(PnFrameError::YPlaneTooShort { expected: y_min, actual: y.len(), }); @@ -585,14 +592,14 @@ impl<'a> P010Frame<'a> { let uv_min = match (uv_stride as usize).checked_mul(chroma_height as usize) { Some(v) => v, None => { - return Err(P010FrameError::GeometryOverflow { + return Err(PnFrameError::GeometryOverflow { stride: uv_stride, rows: chroma_height, }); } }; if uv.len() < uv_min { - return Err(P010FrameError::UvPlaneTooShort { + return Err(PnFrameError::UvPlaneTooShort { expected: uv_min, actual: uv.len(), }); @@ -621,7 +628,7 @@ impl<'a> P010Frame<'a> { ) -> Self { match Self::try_new(y, uv, width, height, y_stride, uv_stride) { Ok(frame) => frame, - Err(_) => panic!("invalid P010Frame dimensions or plane lengths"), + Err(_) => panic!("invalid PnFrame dimensions, plane lengths, or BITS value"), } } @@ -646,7 +653,7 @@ impl<'a> P010Frame<'a> { /// Cost: one O(plane_size) scan per plane. 
The default /// [`Self::try_new`] skips this so the hot path stays O(1). /// - /// Returns [`P010FrameError::SampleLowBitsSet`] on the first + /// Returns [`PnFrameError::SampleLowBitsSet`] on the first /// offending sample — carries the plane, element index, and /// offending value. #[cfg_attr(not(tarpaulin), inline(always))] @@ -657,8 +664,10 @@ impl<'a> P010Frame<'a> { height: u32, y_stride: u32, uv_stride: u32, - ) -> Result { + ) -> Result { let frame = Self::try_new(y, uv, width, height, y_stride, uv_stride)?; + let low_bits = 16 - BITS; + let low_mask: u16 = ((1u32 << low_bits) - 1) as u16; let w = width as usize; let h = height as usize; let uv_w = w; // interleaved: `width / 2` pairs × 2 elements @@ -666,11 +675,12 @@ impl<'a> P010Frame<'a> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s & 0x3F != 0 { - return Err(P010FrameError::SampleLowBitsSet { - plane: P010FramePlane::Y, + if s & low_mask != 0 { + return Err(PnFrameError::SampleLowBitsSet { + plane: PnFramePlane::Y, index: start + col, value: s, + low_bits, }); } } @@ -678,11 +688,12 @@ impl<'a> P010Frame<'a> { for row in 0..chroma_h { let start = row * uv_stride as usize; for (col, &s) in uv[start..start + uv_w].iter().enumerate() { - if s & 0x3F != 0 { - return Err(P010FrameError::SampleLowBitsSet { - plane: P010FramePlane::Uv, + if s & low_mask != 0 { + return Err(PnFrameError::SampleLowBitsSet { + plane: PnFramePlane::Uv, index: start + col, value: s, + low_bits, }); } } @@ -730,23 +741,51 @@ impl<'a> P010Frame<'a> { pub const fn uv_stride(&self) -> u32 { self.uv_stride } + + /// Active bit depth — 10 or 12. Mirrors the `BITS` const parameter + /// so generic code can read it without naming the type. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn bits(&self) -> u32 { + BITS + } } -/// Identifies which plane of a [`P010Frame`] a -/// [`P010FrameError::SampleLowBitsSet`] refers to. 
+/// Type alias for a validated P010 frame (10‑bit, high‑bit‑packed). +/// Use this name at call sites for readability. +pub type P010Frame<'a> = PnFrame<'a, 10>; + +/// Type alias for a validated P012 frame (12‑bit, high‑bit‑packed). +/// Same layout as [`P010Frame`] but with 12 active bits in the high +/// 12 of each `u16` (`sample = value << 4`, low 4 bits zero). +pub type P012Frame<'a> = PnFrame<'a, 12>; + +/// Identifies which plane of a [`PnFrame`] a +/// [`PnFrameError::SampleLowBitsSet`] refers to. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)] -pub enum P010FramePlane { +pub enum PnFramePlane { /// Luma plane. Y, /// Interleaved UV plane. Uv, } -/// Errors returned by [`P010Frame::try_new`] and -/// [`P010Frame::try_new_checked`]. +/// Back‑compat alias for the pre‑generalization plane enum name. +pub type P010FramePlane = PnFramePlane; + +/// Errors returned by [`PnFrame::try_new`] and +/// [`PnFrame::try_new_checked`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] #[non_exhaustive] -pub enum P010FrameError { +pub enum PnFrameError { + /// `BITS` was not one of the supported high‑bit‑packed depths + /// (10, 12). 14 exists in the planar `yuv420p14le` family but not + /// as a Pn hardware output; 16 would need a different kernel + /// family. + #[error("unsupported BITS ({bits}) for PnFrame; must be 10 or 12")] + UnsupportedBits { + /// The unsupported value of the `BITS` const parameter. + bits: u32, + }, /// `width` or `height` was zero. #[error("width ({width}) or height ({height}) is zero")] ZeroDimension { @@ -803,29 +842,34 @@ pub enum P010FrameError { /// Row count that overflowed against the stride. rows: u32, }, - /// A sample's low 6 bits were non‑zero — P010 packs its 10 active - /// bits in the high 10 of each `u16`, so valid samples are always - /// multiples of 64 (`value << 6`). Only - /// [`P010Frame::try_new_checked`] can produce this error. 
+ /// A sample's low `16 - BITS` bits were non‑zero — a Pn sample + /// packs its `BITS` active bits in the high `BITS` of each `u16`, + /// so valid samples are always multiples of `1 << (16 - BITS)` + /// (64 for 10‑bit, 16 for 12‑bit). Only + /// [`PnFrame::try_new_checked`] can produce this error. /// /// Note: the absence of this error does **not** prove the buffer - /// is P010. A `yuv420p10le` buffer of samples that all happen to - /// be multiples of 64 (e.g. `Y = 64`, `UV = 512`) passes the - /// check silently. See [`P010Frame::try_new_checked`] for the - /// full discussion. + /// is Pn. A low‑bit‑packed buffer of samples that all happen to be + /// multiples of `1 << (16 - BITS)` passes the check silently. See + /// [`PnFrame::try_new_checked`] for the full discussion. #[error( - "sample {value:#06x} on plane {plane} at element {index} has non-zero low 6 bits (not a valid P010 sample)" + "sample {value:#06x} on plane {plane} at element {index} has non-zero low {low_bits} bits (not a valid Pn sample at the declared BITS)" )] SampleLowBitsSet { /// Which plane the offending sample lives on. - plane: P010FramePlane, + plane: PnFramePlane, /// Element index within that plane's slice. index: usize, /// The offending sample value. value: u16, + /// Number of low bits expected to be zero (`16 - BITS`). + low_bits: u32, }, } +/// Back‑compat alias for the pre‑generalization error enum name. +pub type P010FrameError = PnFrameError; + /// A validated NV21 (semi‑planar 4:2:0) frame. /// /// Structurally identical to [`Nv12Frame`] — one full-size luma plane @@ -1411,6 +1455,20 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { /// for readability. pub type Yuv420p10Frame<'a> = Yuv420pFrame16<'a, 10>; +/// Type alias for a validated YUV 4:2:0 planar frame at 12 bits per +/// sample (`AV_PIX_FMT_YUV420P12LE`). 
Tight wrapper over +/// [`Yuv420pFrame16`] with `BITS == 12` — same low‑bit‑packed `u16` +/// layout as [`Yuv420p10Frame`], just with 12 active bits in the +/// low 12 of each element (upper 4 bits zero). +pub type Yuv420p12Frame<'a> = Yuv420pFrame16<'a, 12>; + +/// Type alias for a validated YUV 4:2:0 planar frame at 14 bits per +/// sample (`AV_PIX_FMT_YUV420P14LE`). Tight wrapper over +/// [`Yuv420pFrame16`] with `BITS == 14` — same low‑bit‑packed `u16` +/// layout as [`Yuv420p10Frame`], just with 14 active bits in the +/// low 14 of each element (upper 2 bits zero). +pub type Yuv420p14Frame<'a> = Yuv420pFrame16<'a, 14>; + /// Errors returned by [`Yuv420pFrame16::try_new`]. Variant shape /// mirrors [`Yuv420pFrameError`], with `UnsupportedBits` added for /// the new `BITS` parameter and all sizes expressed in **samples** @@ -2163,28 +2221,28 @@ mod tests { fn p010_try_new_rejects_odd_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 15, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::OddWidth { width: 15 })); + assert!(matches!(e, PnFrameError::OddWidth { width: 15 })); } #[test] fn p010_try_new_rejects_zero_dim() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 0, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::ZeroDimension { .. })); + assert!(matches!(e, PnFrameError::ZeroDimension { .. })); } #[test] fn p010_try_new_rejects_y_stride_under_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 16, 8, 8, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YStrideTooSmall { .. })); + assert!(matches!(e, PnFrameError::YStrideTooSmall { .. })); } #[test] fn p010_try_new_rejects_uv_stride_under_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 8).unwrap_err(); - assert!(matches!(e, P010FrameError::UvStrideTooSmall { .. })); + assert!(matches!(e, PnFrameError::UvStrideTooSmall { .. 
})); } #[test] @@ -2192,7 +2250,7 @@ mod tests { let y = std::vec![0u16; 10]; let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YPlaneTooShort { .. })); + assert!(matches!(e, PnFrameError::YPlaneTooShort { .. })); } #[test] @@ -2200,11 +2258,11 @@ mod tests { let y = std::vec![0u16; 16 * 8]; let uv = std::vec![0x8000u16; 8]; let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::UvPlaneTooShort { .. })); + assert!(matches!(e, PnFrameError::UvPlaneTooShort { .. })); } #[test] - #[should_panic(expected = "invalid P010Frame")] + #[should_panic(expected = "invalid PnFrame")] fn p010_new_panics_on_invalid() { let y = std::vec![0u16; 10]; let uv = std::vec![0x8000u16; 16 * 4]; @@ -2218,7 +2276,7 @@ mod tests { let y: [u16; 0] = []; let uv: [u16; 0] = []; let e = P010Frame::try_new(&y, &uv, big, big, big, big).unwrap_err(); - assert!(matches!(e, P010FrameError::GeometryOverflow { .. })); + assert!(matches!(e, PnFrameError::GeometryOverflow { .. })); } #[test] @@ -2239,7 +2297,7 @@ mod tests { let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); match e { - P010FrameError::SampleLowBitsSet { plane, value, .. } => { + PnFrameError::SampleLowBitsSet { plane, value, .. } => { assert_eq!(plane, P010FramePlane::Y); assert_eq!(value, 0x03FF); } @@ -2255,7 +2313,7 @@ mod tests { let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); assert!(matches!( e, - P010FrameError::SampleLowBitsSet { + PnFrameError::SampleLowBitsSet { plane: P010FramePlane::Uv, value: 0x0001, .. @@ -2268,7 +2326,7 @@ mod tests { let y = std::vec![0u16; 10]; // Too small. let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YPlaneTooShort { .. 
}));
+        assert!(matches!(e, PnFrameError::YPlaneTooShort { .. }));
     }
 
     /// Regression documenting a **known limitation** of
diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index 54e7a07..f8ceba2 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -2075,4 +2075,156 @@ mod tests {
             }
         }
     }
+
+    // ---- Generic BITS equivalence (12/14-bit coverage) ------------------
+
+    fn planar_n_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16)
+            .collect()
+    }
+
+    fn p_n_packed_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        let shift = 16 - BITS;
+        (0..n)
+            .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift)
+            .collect()
+    }
+
+    fn check_planar_u8_neon_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_neon = std::vec![0u8; width * 3];
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_neon, "NEON planar {BITS}-bit → u8 diverges");
+    }
+
+    fn check_planar_u16_neon_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_neon = std::vec![0u16; width * 3];
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &mut rgb_scalar,
+            width,
+            matrix,
+            full_range,
+        );
+        unsafe {
+            yuv_420p_n_to_rgb_u16_row::<BITS>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_neon,
+            "NEON planar {BITS}-bit → u16 
diverges" + ); + } + + fn check_pn_u8_neon_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_neon = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_neon, "NEON Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_neon_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_neon = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_neon, "NEON Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn neon_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_neon_equivalence_n::<12>(16, m, full); + check_planar_u16_neon_equivalence_n::<12>(16, m, full); + check_pn_u8_neon_equivalence_n::<12>(16, m, full); + check_pn_u16_neon_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn neon_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + 
check_planar_u8_neon_equivalence_n::<14>(16, m, full); + check_planar_u16_neon_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn neon_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_neon_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_neon_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_neon_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_neon_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn neon_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_neon_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_neon_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 9a887e1..f0619a9 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1796,4 +1796,159 @@ mod tests { check_p010_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- Generic BITS equivalence (12/14-bit coverage) ------------------ + + fn planar_n_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + (0..n) + .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16) + .collect() + } + + fn p_n_packed_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let shift = 16 - BITS; + (0..n) + .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift) + .collect() + } + + fn check_planar_u8_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, 
width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "simd128 planar {BITS}-bit → u8 diverges" + ); + } + + fn check_planar_u16_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "simd128 planar {BITS}-bit → u16 diverges" + ); + } + + fn check_pn_u8_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "simd128 Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, 
width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "simd128 Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn simd128_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_simd128_equivalence_n::<12>(16, m, full); + check_planar_u16_simd128_equivalence_n::<12>(16, m, full); + check_pn_u8_simd128_equivalence_n::<12>(16, m, full); + check_pn_u16_simd128_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn simd128_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_simd128_equivalence_n::<14>(16, m, full); + check_planar_u16_simd128_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn simd128_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_simd128_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_simd128_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_simd128_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_simd128_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn simd128_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_simd128_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_simd128_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index c4e5a12..16deb67 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1805,4 +1805,168 @@ mod tests { check_p010_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- 
Generic BITS equivalence (12/14-bit coverage) ------------------
+
+    fn planar_n_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16)
+            .collect()
+    }
+
+    fn p_n_packed_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        let shift = 16 - BITS;
+        (0..n)
+            .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift)
+            .collect()
+    }
+
+    fn check_planar_u8_avx2_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "AVX2 planar {BITS}-bit → u8 diverges");
+    }
+
+    fn check_planar_u16_avx2_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &mut rgb_scalar,
+            width,
+            matrix,
+            full_range,
+        );
+        unsafe {
+            yuv_420p_n_to_rgb_u16_row::<BITS>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_simd,
+            "AVX2 planar {BITS}-bit → u16 diverges"
+        );
+    }
+
+    fn check_pn_u8_avx2_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+        scalar::p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "AVX2 Pn {BITS}-bit → u8 diverges");
+    }
+
+    fn check_pn_u16_avx2_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+        scalar::p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "AVX2 Pn {BITS}-bit → u16 diverges");
+    }
+
+    #[test]
+    fn avx2_p12_matches_scalar_all_matrices() {
+        for m in [
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::Smpte240m,
+            ColorMatrix::Fcc,
+            ColorMatrix::YCgCo,
+        ] {
+            for full in [true, false] {
+                check_planar_u8_avx2_equivalence_n::<12>(32, m, full);
+                check_planar_u16_avx2_equivalence_n::<12>(32, m, full);
+                check_pn_u8_avx2_equivalence_n::<12>(32, m, full);
+                check_pn_u16_avx2_equivalence_n::<12>(32, m, full);
+            }
+        }
+    }
+
+    #[test]
+    fn avx2_p14_matches_scalar_all_matrices() {
+        for m in [
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::Smpte240m,
+            ColorMatrix::Fcc,
+            ColorMatrix::YCgCo,
+        ] {
+            for full in [true, false] {
+                check_planar_u8_avx2_equivalence_n::<14>(32, m, full);
+
check_planar_u16_avx2_equivalence_n::<14>(32, m, full); + } + } + } + + #[test] + fn avx2_p12_matches_scalar_tail_widths() { + for w in [34usize, 62, 66, 1922] { + check_planar_u8_avx2_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx2_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_avx2_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx2_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn avx2_p14_matches_scalar_tail_widths() { + for w in [34usize, 62, 66, 1922] { + check_planar_u8_avx2_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx2_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 6b0dbe9..238a09a 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1841,4 +1841,171 @@ mod tests { check_p010_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- Generic BITS equivalence (12/14-bit coverage) ------------------ + + fn planar_n_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + (0..n) + .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16) + .collect() + } + + fn p_n_packed_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let shift = 16 - BITS; + (0..n) + .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift) + .collect() + } + + fn check_planar_u8_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, 
width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "AVX-512 planar {BITS}-bit → u8 diverges" + ); + } + + fn check_planar_u16_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "AVX-512 planar {BITS}-bit → u16 diverges" + ); + } + + fn check_pn_u8_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = 
std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn avx512_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx512_equivalence_n::<12>(64, m, full); + check_planar_u16_avx512_equivalence_n::<12>(64, m, full); + check_pn_u8_avx512_equivalence_n::<12>(64, m, full); + check_pn_u16_avx512_equivalence_n::<12>(64, m, full); + } + } + } + + #[test] + fn avx512_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx512_equivalence_n::<14>(64, m, full); + check_planar_u16_avx512_equivalence_n::<14>(64, m, full); + } + } + } + + #[test] + fn avx512_p12_matches_scalar_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_planar_u8_avx512_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx512_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_avx512_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx512_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn avx512_p14_matches_scalar_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_planar_u8_avx512_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx512_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index b81a060..1dd5f2d 100644 --- a/src/row/arch/x86_sse41.rs +++ 
b/src/row/arch/x86_sse41.rs
@@ -1584,4 +1584,180 @@ mod tests {
         check_p010_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false);
         check_p010_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
     }
+
+    // ---- Generic BITS equivalence (12/14-bit coverage) ------------------
+    //
+    // The helpers below parameterize over `const BITS: u32` so the same
+    // scalar-equivalence scaffolding covers 10/12/14 without duplicating
+    // the 16-pixel block seeding + diff harness. `<10>` is already
+    // exercised by the dedicated tests above; `<12>` / `<14>` add
+    // regression coverage for the new yuv420p12 / yuv420p14 / P012
+    // kernels. 14-bit is planar-only (no P014 in Ship 4a).
+
+    fn planar_n_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16)
+            .collect()
+    }
+
+    fn p_n_packed_plane<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        let shift = 16 - BITS;
+        (0..n)
+            .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift)
+            .collect()
+    }
+
+    fn check_planar_u8_sse41_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_simd,
+            "SSE4.1 planar {BITS}-bit → u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+        );
+    }
+
+    fn check_planar_u16_sse41_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &mut rgb_scalar,
+            width,
+            matrix,
+            full_range,
+        );
+        unsafe {
+            yuv_420p_n_to_rgb_u16_row::<BITS>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_simd,
+            "SSE4.1 planar {BITS}-bit → u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+        );
+    }
+
+    fn check_pn_u8_sse41_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+        scalar::p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 Pn {BITS}-bit → u8 diverges");
+    }
+
+    fn check_pn_u16_sse41_equivalence_n<const BITS: u32>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+        scalar::p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_simd, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 Pn {BITS}-bit → u16 diverges");
+    }
+
+    #[test]
+    fn
sse41_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_sse41_equivalence_n::<12>(16, m, full); + check_planar_u16_sse41_equivalence_n::<12>(16, m, full); + check_pn_u8_sse41_equivalence_n::<12>(16, m, full); + check_pn_u16_sse41_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn sse41_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_sse41_equivalence_n::<14>(16, m, full); + check_planar_u16_sse41_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn sse41_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_sse41_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_sse41_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_sse41_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_sse41_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn sse41_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_sse41_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_sse41_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/mod.rs b/src/row/mod.rs index f6257fc..1201ac6 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -650,6 +650,451 @@ pub fn p010_to_rgb_u16_row( scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } +/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 12 active bits in the low 12 bits of each +/// element (low‑bit‑packed `yuv420p12le` convention). 
Output is packed +/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The +/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching +/// `yuv420p12le` convention — upper 4 bits zero). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (14‑bit values in the low 14 of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ +/// packed — 12 active bits in the high 12 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and +/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but +/// `>> 4` instead of `>> 6` at each `u16` load. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** to **native‑depth `u16`** packed RGB +/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed +/// `yuv420p12le` convention, **not** P012's high‑bit packing). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. 
/// diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 92835a5..5877255 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -19,12 +19,15 @@ use thiserror::Error; use crate::{ HsvBuffers, PixelSink, SourceFormat, row::{ - nv12_to_rgb_row, nv21_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, rgb_to_hsv_row, - yuv_420_to_rgb_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, + nv12_to_rgb_row, nv21_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, + p012_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv420p10_to_rgb_row, + yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, + yuv420p14_to_rgb_u16_row, }, yuv::{ - Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, P010, P010Row, P010Sink, Yuv420p, Yuv420p10, - Yuv420p10Row, Yuv420p10Sink, Yuv420pRow, Yuv420pSink, + Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, P010, P010Row, P010Sink, P012, P012Row, + P012Sink, Yuv420p, Yuv420p10, Yuv420p10Row, Yuv420p10Sink, Yuv420p12, Yuv420p12Row, + Yuv420p12Sink, Yuv420p14, Yuv420p14Row, Yuv420p14Sink, Yuv420pRow, Yuv420pSink, }, }; @@ -225,6 +228,35 @@ pub enum RowSlice { /// bits sit in the high 10 of its `u16`). #[display("UV Half 10")] UvHalf10, + /// Full‑width Y row of a **12‑bit** planar source ([`Yuv420p12`]). + /// `u16` samples, `width` elements, low‑bit‑packed. + #[display("Y12")] + Y12, + /// Half‑width U row of a **12‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("U Half 12")] + UHalf12, + /// Half‑width V row of a **12‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("V Half 12")] + VHalf12, + /// Half‑width interleaved UV row of a **12‑bit semi‑planar** source + /// ([`P012`]). `u16` samples, `width` elements (high‑bit‑packed: 12 + /// active bits in the high 12 of each `u16`). + #[display("UV Half 12")] + UvHalf12, + /// Full‑width Y row of a **14‑bit** planar source ([`Yuv420p14`]). 
+ /// `u16` samples, `width` elements, low‑bit‑packed. + #[display("Y14")] + Y14, + /// Half‑width U row of a **14‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("U Half 14")] + UHalf14, + /// Half‑width V row of a **14‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("V Half 14")] + VHalf14, } /// A sink that writes any subset of `{RGB, Luma, HSV}` into @@ -1297,6 +1329,545 @@ impl PixelSink for MixedSinker<'_, P010> { } } +// ---- Yuv420p12 impl ---------------------------------------------------- + +impl<'a> MixedSinker<'a, Yuv420p12> { + /// Attaches a packed **`u16`** RGB output buffer. Mirrors + /// [`MixedSinker::with_rgb_u16`] but produces 12‑bit + /// output (values in `[0, 4095]` in the low 12 of each `u16`, upper + /// 4 zero). Length is measured in `u16` **elements** (`width × + /// height × 3`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl Yuv420p12Sink for MixedSinker<'_, Yuv420p12> {} + +impl PixelSink for MixedSinker<'_, Yuv420p12> { + type Input<'r> = Yuv420p12Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: Yuv420p12Row<'_>) -> Result<(), Self::Error> { + // Bit depth is fixed by the format (12) — declared as a const so + // the downshift for u8 luma stays obvious at the call site. + const BITS: u32 = 12; + + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y12, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.u_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf12, + row: idx, + expected: w / 2, + actual: row.u_half().len(), + }); + } + if row.v_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::VHalf12, + row: idx, + expected: w / 2, + actual: row.v_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. 
+ } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> (BITS - 8)) as u8; + } + } + + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + yuv420p12_to_rgb_u16_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + yuv420p12_to_rgb_row( + row.y(), + row.u_half(), + row.v_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + +// ---- Yuv420p14 impl ---------------------------------------------------- + +impl<'a> MixedSinker<'a, Yuv420p14> { + /// Attaches a packed **`u16`** RGB output buffer. 
Produces 14‑bit + /// output (values in `[0, 16383]` in the low 14 of each `u16`, upper + /// 2 zero). Length is measured in `u16` **elements** (`width × + /// height × 3`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl Yuv420p14Sink for MixedSinker<'_, Yuv420p14> {} + +impl PixelSink for MixedSinker<'_, Yuv420p14> { + type Input<'r> = Yuv420p14Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: Yuv420p14Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 14; + + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y14, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.u_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf14, + row: idx, + expected: w / 2, + actual: row.u_half().len(), + }); + } + if row.v_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::VHalf14, + row: idx, + expected: w / 2, + actual: 
row.v_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. + } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> (BITS - 8)) as u8; + } + } + + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + yuv420p14_to_rgb_u16_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + yuv420p14_to_rgb_row( + row.y(), + row.u_half(), + row.v_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + 
); + } + Ok(()) + } +} + +// ---- P012 impl --------------------------------------------------------- + +impl<'a> MixedSinker<'a, P012> { + /// Attaches a packed **`u16`** RGB output buffer. Produces 12‑bit + /// output in **low‑bit‑packed** `yuv420p12le` convention (values in + /// `[0, 4095]` in the low 12 of each `u16`, upper 4 zero) — + /// **not** P012's high‑bit packing. Callers feeding a P012 consumer + /// must shift the output left by 4. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl P012Sink for MixedSinker<'_, P012> {} + +impl PixelSink for MixedSinker<'_, P012> { + type Input<'r> = P012Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: P012Row<'_>) -> Result<(), Self::Error> { + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y12, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.uv_half().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UvHalf12, + row: idx, + 
expected: w, + actual: row.uv_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. + } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + // Luma: P012 samples are high‑bit‑packed (`value << 4`). Taking + // the high byte via `>> 8` gives the top 8 bits of the 12‑bit + // value — identical accessor to P010 (both put active bits in the + // high `BITS` positions of the `u16`). + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> 8) as u8; + } + } + + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + p012_to_rgb_u16_row( + row.y(), + row.uv_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + p012_to_rgb_row( + row.y(), + row.uv_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + 
if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + /// Returns `Ok(())` iff the walker's frame dimensions exactly match /// the sinker's configured dimensions. Called from /// [`PixelSink::begin_frame`] on both `MixedSinker` and diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs index eedc2ab..b3f1f4c 100644 --- a/src/yuv/mod.rs +++ b/src/yuv/mod.rs @@ -10,20 +10,33 @@ //! chroma (Android MediaCodec default). //! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits //! per sample (HDR10 / 10‑bit SDR software decode). +//! - [`Yuv420p12`](crate::yuv::Yuv420p12) — 4:2:0 planar at 12 bits +//! per sample (HEVC Main 12 / VP9 Profile 3 software decode). +//! - [`Yuv420p14`](crate::yuv::Yuv420p14) — 4:2:0 planar at 14 bits +//! per sample (grading / mastering pipelines). //! - [`P010`](crate::yuv::P010) — 4:2:0 semi‑planar at 10 bits per //! sample, high‑bit‑packed (HDR hardware decode: VideoToolbox, //! VA‑API, NVDEC, D3D11VA, Intel QSV). +//! - [`P012`](crate::yuv::P012) — 4:2:0 semi‑planar at 12 bits per +//! sample, high‑bit‑packed (HEVC Main 12 / VP9 Profile 3 hardware +//! decode). //! //! Other families land in follow-up commits. 
mod nv12; mod nv21; mod p010; +mod p012; mod yuv420p; mod yuv420p10; +mod yuv420p12; +mod yuv420p14; pub use nv12::{Nv12, Nv12Row, Nv12Sink, nv12_to}; pub use nv21::{Nv21, Nv21Row, Nv21Sink, nv21_to}; pub use p010::{P010, P010Row, P010Sink, p010_to}; +pub use p012::{P012, P012Row, P012Sink, p012_to}; pub use yuv420p::{Yuv420p, Yuv420pRow, Yuv420pSink, yuv420p_to}; pub use yuv420p10::{Yuv420p10, Yuv420p10Row, Yuv420p10Sink, yuv420p10_to}; +pub use yuv420p12::{Yuv420p12, Yuv420p12Row, Yuv420p12Sink, yuv420p12_to}; +pub use yuv420p14::{Yuv420p14, Yuv420p14Row, Yuv420p14Sink, yuv420p14_to}; diff --git a/src/yuv/p012.rs b/src/yuv/p012.rs new file mode 100644 index 0000000..b7b058e --- /dev/null +++ b/src/yuv/p012.rs @@ -0,0 +1,152 @@ +//! P012 — semi‑planar 4:2:0, 12‑bit, high‑bit‑packed +//! (`AV_PIX_FMT_P012LE`). +//! +//! Storage is a 2‑plane layout identical to [`super::P010`]: one full‑ +//! size Y plane plus one interleaved UV plane at half width and half +//! height. Sample width is `u16` with the 12 active bits in the +//! **high** 12 positions of each element (`sample = value << 4`), low +//! 4 bits zero. This is the 12‑bit sibling of Microsoft's P010 +//! convention and what HEVC Main 12 / VP9 Profile 3 hardware decoders +//! emit. +//! +//! Conversion semantics mirror [`super::P010`] on the layout side and +//! [`super::Yuv420p12`] on the Q‑math side: two consecutive Y rows +//! share one UV row (4:2:0), chroma is nearest‑neighbor upsampled in +//! registers inside the row primitive, and every SIMD backend shifts +//! each `u16` load right by 4 (= `16 - BITS` with `BITS == 12`) to +//! extract the 12‑bit value before running the same Q15 pipeline used +//! by [`super::P010`]. + +use crate::{ColorMatrix, PixelSink, SourceFormat, frame::P012Frame, sealed::Sealed}; + +/// Zero‑sized marker for the P012 source format. Used as the `F` type +/// parameter on [`crate::sinker::MixedSinker`]. 
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct P012; + +impl Sealed for P012 {} +impl SourceFormat for P012 {} + +/// One output row of a P012 source handed to a [`P012Sink`]. +/// +/// Accessors: +/// - [`y`](Self::y) — full‑width Y row (`width` `u16` samples, high‑ +/// bit‑packed). +/// - [`uv_half`](Self::uv_half) — **interleaved, half‑width** UV row +/// (`width` `u16` elements = `width / 2` U/V pairs, U first). The +/// row primitive deinterleaves and upsamples in‑register. +/// - [`row`](Self::row) — output row index (`0 ..= frame.height() - 1`). +/// - [`matrix`](Self::matrix), [`full_range`](Self::full_range) — +/// carried through from the kernel call. +#[derive(Debug, Clone, Copy)] +pub struct P012Row<'a> { + y: &'a [u16], + uv_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> P012Row<'a> { + /// Bundles one row of a P012 source for a [`P012Sink`]. + #[cfg_attr(not(tarpaulin), inline(always))] + pub(crate) fn new( + y: &'a [u16], + uv_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + uv_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples, high‑bit‑packed + /// (12 active bits in the high 12 of each element). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Interleaved UV row — `width` `u16` elements laid out as + /// `U0, V0, U1, V1, …, U_{w/2-1}, V_{w/2-1}`. Each element is + /// high‑bit‑packed. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn uv_half(&self) -> &'a [u16] { + self.uv_half + } + + /// Output row index within the frame. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn row(&self) -> usize { + self.row + } + + /// YUV → RGB matrix carried through from the kernel call. 
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn matrix(&self) -> ColorMatrix {
+        self.matrix
+    }
+
+    /// `true` iff Y uses the full sample range (`[0, 4095]` for 12‑bit,
+    /// scaled into the high 12 bits of each `u16`); `false` for limited
+    /// range.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn full_range(&self) -> bool {
+        self.full_range
+    }
+}
+
+/// Sinks that consume P012 rows.
+///
+/// A subtrait of [`PixelSink`] that pins the row shape to
+/// [`P012Row`]. Implementors get
+/// `process(&mut self, row: P012Row<'_>) -> Result<(), Self::Error>`
+/// via the supertrait.
+pub trait P012Sink: for<'a> PixelSink<Input<'a> = P012Row<'a>> {}
+
+/// Converts a P012 frame by walking its rows and feeding each one to
+/// the [`P012Sink`].
+///
+/// The kernel is a pure row walker — no color arithmetic happens
+/// here. Slice math picks the Y row and the correct UV row for each
+/// output row (`chroma_row = row / 2` for 4:2:0) and hands borrows to
+/// the Sink. The Sink decides what to derive and where to write.
+pub fn p012_to<S: P012Sink>(
+    src: &P012Frame<'_>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    sink.begin_frame(src.width(), src.height())?;
+
+    let w = src.width() as usize;
+    let h = src.height() as usize;
+    let y_stride = src.y_stride() as usize;
+    let uv_stride = src.uv_stride() as usize;
+    // UV row payload is `width` `u16` elements — `width / 2` interleaved
+    // U/V pairs.
+    let uv_row_elems = w;
+
+    let y_plane = src.y();
+    let uv_plane = src.uv();
+
+    for row in 0..h {
+        let y_start = row * y_stride;
+        let y = &y_plane[y_start..y_start + w];
+
+        // 4:2:0 chroma subsampling: two consecutive Y rows share one UV
+        // row.
+ let chroma_row = row / 2; + let uv_start = chroma_row * uv_stride; + let uv_half = &uv_plane[uv_start..uv_start + uv_row_elems]; + + sink.process(P012Row::new(y, uv_half, row, matrix, full_range))?; + } + Ok(()) +} diff --git a/src/yuv/yuv420p12.rs b/src/yuv/yuv420p12.rs new file mode 100644 index 0000000..5995c6c --- /dev/null +++ b/src/yuv/yuv420p12.rs @@ -0,0 +1,161 @@ +//! YUV 4:2:0 planar 12‑bit (`AV_PIX_FMT_YUV420P12LE`). +//! +//! Storage mirrors [`super::Yuv420p10`] — three planes, Y at full size +//! plus U / V at half width and half height — with **`u16`** samples +//! (12 active bits in the **low** 12 of each element, upper 4 zero). +//! The [`Yuv420p12Frame`] type alias pins the bit depth; the underlying +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS`, so the same +//! Q15 scalar + SIMD kernel family that powers `Yuv420p10` runs +//! unchanged against the 12‑bit instantiation. +//! +//! Ships in colconv v0.2a alongside [`super::Yuv420p14`] and +//! [`super::P012`]. Kernel semantics match [`super::Yuv420p10`]: two +//! consecutive Y rows share one chroma row (4:2:0), chroma is +//! nearest‑neighbor upsampled in registers inside the row primitive, +//! and Q15 intermediates stay in i32 (chroma_sum < 10⁹ < i32 max at 12 +//! bits — verified against the scalar reference per SIMD backend). + +use crate::{ + ColorMatrix, PixelSink, SourceFormat, + frame::{Yuv420p12Frame, Yuv420pFrame16}, + sealed::Sealed, +}; + +/// Zero‑sized marker for the YUV 4:2:0 **12‑bit** source format. Used +/// as the `F` type parameter on [`crate::sinker::MixedSinker`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct Yuv420p12; + +impl Sealed for Yuv420p12 {} +impl SourceFormat for Yuv420p12 {} + +/// One output row of a 12‑bit YUV 4:2:0 source handed to a +/// [`Yuv420p12Sink`]. Structurally identical to [`super::Yuv420p10Row`], +/// just with values in `[0, 4095]` instead of `[0, 1023]`. 
+#[derive(Debug, Clone, Copy)]
+pub struct Yuv420p12Row<'a> {
+    y: &'a [u16],
+    u_half: &'a [u16],
+    v_half: &'a [u16],
+    row: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+}
+
+impl<'a> Yuv420p12Row<'a> {
+    /// Bundles one row of a 12‑bit 4:2:0 source for a [`Yuv420p12Sink`].
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        y: &'a [u16],
+        u_half: &'a [u16],
+        v_half: &'a [u16],
+        row: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) -> Self {
+        Self {
+            y,
+            u_half,
+            v_half,
+            row,
+            matrix,
+            full_range,
+        }
+    }
+
+    /// Full‑width Y (luma) row — `width` `u16` samples.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn y(&self) -> &'a [u16] {
+        self.y
+    }
+
+    /// Half‑width U (Cb) row — `width / 2` `u16` samples.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn u_half(&self) -> &'a [u16] {
+        self.u_half
+    }
+
+    /// Half‑width V (Cr) row — `width / 2` `u16` samples.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn v_half(&self) -> &'a [u16] {
+        self.v_half
+    }
+
+    /// Output row index within the frame.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn row(&self) -> usize {
+        self.row
+    }
+
+    /// YUV → RGB matrix carried through from the kernel call.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn matrix(&self) -> ColorMatrix {
+        self.matrix
+    }
+
+    /// `true` iff Y uses the full sample range (`[0, 4095]` for 12‑bit);
+    /// `false` for limited range (`[256, 3760]` luma, `[256, 3840]`
+    /// chroma — the 8‑bit `[16, 235]` / `[16, 240]` ranges scaled by 16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn full_range(&self) -> bool {
+        self.full_range
+    }
+}
+
+/// Sinks that consume 12‑bit YUV 4:2:0 rows.
+pub trait Yuv420p12Sink: for<'a> PixelSink<Input<'a> = Yuv420p12Row<'a>> {}
+
+/// Converts a 12‑bit YUV 4:2:0 frame by walking its rows and feeding
+/// each one to the [`Yuv420p12Sink`].
 Mirrors [`super::yuv420p10_to`] —
+/// pure row walker, all color arithmetic happens inside the Sink via
+/// the crate's row primitives instantiated at `BITS == 12`.
+pub fn yuv420p12_to<S: Yuv420p12Sink>(
+    src: &Yuv420p12Frame<'_>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    yuv420p12_walker::<12, S>(src, full_range, matrix, sink)
+}
+
+/// Row walker for the 12‑bit YUV 4:2:0 source. `BITS` is a const
+/// generic so [`Yuv420pFrame16`] geometry reads (stride, plane
+/// slicing) are monomorphized; the row/sink types bound below are
+/// still pinned to the 12‑bit variants.
+#[cfg_attr(not(tarpaulin), inline(always))]
+fn yuv420p12_walker<const BITS: u32, S: Yuv420p12Sink>(
+    src: &Yuv420pFrame16<'_, BITS>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    sink.begin_frame(src.width(), src.height())?;
+
+    let w = src.width() as usize;
+    let h = src.height() as usize;
+    let y_stride = src.y_stride() as usize;
+    let u_stride = src.u_stride() as usize;
+    let v_stride = src.v_stride() as usize;
+    let chroma_width = w / 2;
+
+    let y_plane = src.y();
+    let u_plane = src.u();
+    let v_plane = src.v();
+
+    for row in 0..h {
+        let y_start = row * y_stride;
+        let y = &y_plane[y_start..y_start + w];
+
+        let chroma_row = row / 2;
+        let u_start = chroma_row * u_stride;
+        let v_start = chroma_row * v_stride;
+        let u_half = &u_plane[u_start..u_start + chroma_width];
+        let v_half = &v_plane[v_start..v_start + chroma_width];
+
+        sink.process(Yuv420p12Row::new(
+            y, u_half, v_half, row, matrix, full_range,
+        ))?;
+    }
+    Ok(())
+}
diff --git a/src/yuv/yuv420p14.rs b/src/yuv/yuv420p14.rs
new file mode 100644
index 0000000..27c54ee
--- /dev/null
+++ b/src/yuv/yuv420p14.rs
@@ -0,0 +1,159 @@
+//! YUV 4:2:0 planar 14‑bit (`AV_PIX_FMT_YUV420P14LE`).
+//!
+//! Storage mirrors [`super::Yuv420p10`] — three planes, Y at full size
+//! plus U / V at half width and half height — with **`u16`** samples
+//! 
(14 active bits in the **low** 14 of each element, upper 2 zero). +//! The [`Yuv420p14Frame`] type alias pins the bit depth; the underlying +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS`, so the same +//! Q15 scalar + SIMD kernel family that powers `Yuv420p10` / +//! `Yuv420p12` runs unchanged against the 14‑bit instantiation. +//! +//! Kernel math constraint: at 14 bits, chroma_sum still fits in i32 +//! (~10⁹ ≤ 2³¹), so the Q15 pipeline stays unchanged. 16‑bit would +//! overflow and needs a separate kernel family. + +use crate::{ + ColorMatrix, PixelSink, SourceFormat, + frame::{Yuv420p14Frame, Yuv420pFrame16}, + sealed::Sealed, +}; + +/// Zero‑sized marker for the YUV 4:2:0 **14‑bit** source format. Used +/// as the `F` type parameter on [`crate::sinker::MixedSinker`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct Yuv420p14; + +impl Sealed for Yuv420p14 {} +impl SourceFormat for Yuv420p14 {} + +/// One output row of a 14‑bit YUV 4:2:0 source handed to a +/// [`Yuv420p14Sink`]. Structurally identical to [`super::Yuv420p10Row`], +/// just with values in `[0, 16383]` instead of `[0, 1023]`. +#[derive(Debug, Clone, Copy)] +pub struct Yuv420p14Row<'a> { + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> Yuv420p14Row<'a> { + /// Bundles one row of a 14‑bit 4:2:0 source for a [`Yuv420p14Sink`]. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub(crate) fn new( + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + u_half, + v_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Half‑width U (Cb) row — `width / 2` `u16` samples. 
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn u_half(&self) -> &'a [u16] {
+        self.u_half
+    }
+
+    /// Half‑width V (Cr) row — `width / 2` `u16` samples.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn v_half(&self) -> &'a [u16] {
+        self.v_half
+    }
+
+    /// Output row index within the frame.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn row(&self) -> usize {
+        self.row
+    }
+
+    /// YUV → RGB matrix carried through from the kernel call.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn matrix(&self) -> ColorMatrix {
+        self.matrix
+    }
+
+    /// `true` iff Y uses the full sample range (`[0, 16383]` for
+    /// 14‑bit); `false` for limited range (`[1024, 15040]` luma,
+    /// `[1024, 15360]` chroma — the 8‑bit `[16, 235]` / `[16, 240]`
+    /// ranges scaled by 64).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn full_range(&self) -> bool {
+        self.full_range
+    }
+}
+
+/// Sinks that consume 14‑bit YUV 4:2:0 rows.
+pub trait Yuv420p14Sink: for<'a> PixelSink<Input<'a> = Yuv420p14Row<'a>> {}
+
+/// Converts a 14‑bit YUV 4:2:0 frame by walking its rows and feeding
+/// each one to the [`Yuv420p14Sink`]. Mirrors [`super::yuv420p10_to`] —
+/// pure row walker, all color arithmetic happens inside the Sink via
+/// the crate's row primitives instantiated at `BITS == 14`.
+pub fn yuv420p14_to<S: Yuv420p14Sink>(
+    src: &Yuv420p14Frame<'_>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    yuv420p14_walker::<14, S>(src, full_range, matrix, sink)
+}
+
+/// Row walker for the 14‑bit YUV 4:2:0 source. `BITS` is a const
+/// generic so [`Yuv420pFrame16`] geometry reads (stride, plane
+/// slicing) are monomorphized; the row/sink types bound below are
+/// still pinned to the 14‑bit variants.
+#[cfg_attr(not(tarpaulin), inline(always))]
+fn yuv420p14_walker<const BITS: u32, S: Yuv420p14Sink>(
+    src: &Yuv420pFrame16<'_, BITS>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    sink.begin_frame(src.width(), src.height())?;
+
+    let w = src.width() as usize;
+    let h = src.height() as usize;
+    let y_stride = src.y_stride() as usize;
+    let u_stride = src.u_stride() as usize;
+    let v_stride = src.v_stride() as usize;
+    let chroma_width = w / 2;
+
+    let y_plane = src.y();
+    let u_plane = src.u();
+    let v_plane = src.v();
+
+    for row in 0..h {
+        let y_start = row * y_stride;
+        let y = &y_plane[y_start..y_start + w];
+
+        let chroma_row = row / 2;
+        let u_start = chroma_row * u_stride;
+        let v_start = chroma_row * v_stride;
+        let u_half = &u_plane[u_start..u_start + chroma_width];
+        let v_half = &v_plane[v_start..v_start + chroma_width];
+
+        sink.process(Yuv420p14Row::new(
+            y, u_half, v_half, row, matrix, full_range,
+        ))?;
+    }
+    Ok(())
+}

From d044da1341bfc314ae17268e2d26422fc04902fe Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 19 Apr 2026 23:55:39 +1200
Subject: [PATCH 4/4] more simd backend

---
 src/frame.rs                 | 172 +++++++++----
 src/lib.rs                   |  47 +++-
 src/row/arch/neon.rs         |  17 +-
 src/row/arch/wasm_simd128.rs |  22 +-
 src/row/arch/x86_avx2.rs     |  22 +-
 src/row/arch/x86_avx512.rs   |  22 +-
 src/row/arch/x86_sse41.rs    |  21 +-
 src/row/scalar.rs            |  27 ++
 src/sinker/mixed.rs          | 474 ++++++++++++++++++++++++++++++++++-
 src/sinker/mod.rs            |   9 +-
 src/yuv/mod.rs               |  27 +-
 src/yuv/yuv420p10.rs         |  23 +-
 12 files changed, 770 insertions(+), 113 deletions(-)

diff --git a/src/frame.rs b/src/frame.rs
index 2c1997b..a56353f 100644
--- a/src/frame.rs
+++ b/src/frame.rs
@@ -496,28 +496,28 @@ pub enum Nv12FrameError {
 ///
 /// # Input sample range and packing sanity
 ///
-/// Each `u16` sample's 10 active bits live in the high 10 positions;
-/// the low 6 bits are expected to be zero. [`Self::try_new`] validates
-/// geometry only.
+/// Each `u16` sample's `BITS` active bits live in the high `BITS` +/// positions; the low `16 - BITS` bits are expected to be zero. +/// [`Self::try_new`] validates geometry only. /// /// [`Self::try_new_checked`] additionally scans every sample and -/// rejects any with non‑zero low 6 bits — a **necessary but not -/// sufficient** packing sanity check. It catches mispacked -/// `yuv420p10le` buffers as long as **at least one** sample has -/// low‑bit content (the usual case for noisy real‑world image data), -/// but it **cannot distinguish** P010 from a `yuv420p10le` buffer -/// whose samples all happen to be multiples of 64. Values like -/// `Y = 64` (limited‑range black) and `UV = 512` (neutral chroma) -/// both have low 6 bits zero and so pass the check, even though the -/// buffer layout is wrong. For strict provenance, callers must rely -/// on their source format metadata and pick the right frame type -/// ([`P010Frame`] vs [`Yuv420p10Frame`]) at construction. +/// rejects any with non‑zero low `16 - BITS` bits — a **necessary +/// but not sufficient** packing sanity check. Its catch rate +/// weakens as `BITS` grows: at `BITS == 10` it rejects 63/64 random +/// samples and is a strong signal; at `BITS == 12` it only rejects +/// 15/16, and **common flat‑region values in decoder output are +/// exactly the ones that slip through** (`Y = 256/1024` limited +/// black, `UV = 2048` neutral chroma are all multiples of 16 in +/// both layouts). See [`Self::try_new_checked`] for the full +/// table. For strict provenance, callers must rely on their source +/// format metadata and pick the right frame type ([`PnFrame`] vs +/// [`Yuv420pFrame16`]) at construction. /// -/// Kernels shift each load right by 6 to extract the 10‑bit value, -/// so mispacked input (e.g. 
a `yuv420p10le` buffer handed to the -/// P010 kernel) produces deterministic, backend‑independent output -/// — wrong colors, but consistently wrong across scalar + every -/// SIMD backend, which is visible in any output diff. +/// Kernels shift each load right by `16 - BITS` to extract the +/// active value, so mispacked input (e.g. a `yuv420p12le` buffer +/// handed to the P012 kernel) produces deterministic, backend‑ +/// independent output — wrong colors, but consistently wrong across +/// scalar + every SIMD backend, which is visible in any output diff. #[derive(Debug, Clone, Copy)] pub struct PnFrame<'a, const BITS: u32> { y: &'a [u16], @@ -633,29 +633,44 @@ impl<'a, const BITS: u32> PnFrame<'a, BITS> { } /// Like [`Self::try_new`] but additionally scans every sample and - /// rejects any whose **low 6 bits** are non‑zero. A valid P010 - /// sample has its 10 active bits in the high 10 positions and zero - /// below, so non‑zero low bits is evidence the buffer isn't P010. + /// rejects any whose **low `16 - BITS` bits** are non‑zero. A valid + /// high‑bit‑packed sample has its `BITS` active bits in the high + /// `BITS` positions and zero below, so non‑zero low bits is + /// evidence the buffer isn't Pn‑shaped. /// /// **This is a packing sanity check, not a provenance validator.** - /// The check catches noisy `yuv420p10le` data (where most samples - /// have low‑bit content), but it **cannot** distinguish P010 from - /// a `yuv420p10le` buffer whose samples all happen to be multiples - /// of 64. Common flat‑region values like `Y = 64` (limited‑range - /// black) or `UV = 512` (neutral chroma) are multiples of 64 in - /// both layouts, so a yuv420p10le buffer of flat content will - /// silently pass this check. 
Callers who need strict provenance - /// must rely on their source format metadata and pick the right - /// frame type at construction ([`P010Frame`] vs [`Yuv420p10Frame`]); - /// no runtime check on opaque `u16` data can reliably tell the two - /// layouts apart. + /// The check catches noisy low‑bit‑packed data (where most samples + /// have low‑bit content), but it **cannot** distinguish Pn from a + /// low‑bit‑packed buffer whose samples all happen to be multiples + /// of `1 << (16 - BITS)`. The catch rate scales with `BITS`: + /// + /// - `BITS == 10` (P010): 6 low bits must be zero. Random u16 + /// samples pass with probability `1/64`; noisy `yuv420p10le` + /// data is almost always caught. + /// - `BITS == 12` (P012): only 4 low bits. Pass probability is + /// `1/16` — 4× weaker. **Common limited‑range flat‑region values + /// (`Y = 256` limited black, `UV = 2048` neutral chroma, + /// `Y = 1024` full black) are all multiples of 16 in both + /// layouts**, so flat `yuv420p12le` content passes **every + /// time**. The `>> 4` extraction in the Pn kernels then + /// discards the real signal and produces badly darkened + /// output. For P012, prefer format metadata over this check. + /// + /// Callers who need strict provenance must rely on their source + /// format metadata and pick the right frame type at construction + /// ([`PnFrame`] vs [`Yuv420pFrame16`]); no runtime check on opaque + /// `u16` data can reliably tell the two layouts apart, and the + /// weakness is proportionally worse the higher the `BITS` value. + /// The regression test + /// `p012_try_new_checked_accepts_low_packed_flat_content_by_design` + /// in `frame::tests` pins this limitation in code. /// /// Cost: one O(plane_size) scan per plane. The default /// [`Self::try_new`] skips this so the hot path stays O(1). /// /// Returns [`PnFrameError::SampleLowBitsSet`] on the first - /// offending sample — carries the plane, element index, and - /// offending value. 
+ /// offending sample — carries the plane, element index, offending + /// value, and the number of low bits expected to be zero. #[cfg_attr(not(tarpaulin), inline(always))] pub fn try_new_checked( y: &'a [u16], @@ -1138,11 +1153,13 @@ pub enum Nv21FrameError { /// [`Self::try_new_checked`] — it scans every sample and returns /// [`Yuv420pFrame16Error::SampleOutOfRange`] on the first violation. /// -/// colconv v0.2 ships `BITS == 10` only (the use‑case keystone for -/// HDR and 10‑bit SDR). 12 and 14 are mechanical follow‑ups that -/// just relax the constructor's `BITS` check and add tiered aliases -/// — the kernel math (Q15 coefficients + i32 intermediates) works -/// unchanged across all three, derived at compile time from `BITS`. +/// All three supported depths — `BITS == 10` (HDR10 / 10‑bit SDR +/// keystone), `BITS == 12` (HEVC Main 12 / VP9 Profile 3), and +/// `BITS == 14` (grading / mastering pipelines) — share the same +/// scalar + SIMD kernel family. The Q15 coefficients + i32 +/// intermediates work unchanged across all three, derived at +/// compile time from `BITS`; the constructor validates the `BITS` +/// value against the set `{10, 12, 14}` up front. /// /// 16‑bit input (which would overflow the i32 chroma sum in the /// Q15 path) is **not** represented by this type — it needs a @@ -1174,8 +1191,9 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { /// lengths, and the `BITS` parameter. /// /// Returns [`Yuv420pFrame16Error`] if any of: - /// - `BITS` is not 10, 12, or 14 (colconv v0.2 additionally rejects - /// 12/14 at the type alias layer — see [`Yuv420p10Frame`]), + /// - `BITS` is not 10, 12, or 14 — use [`Yuv420p10Frame`], + /// [`Yuv420p12Frame`], or [`Yuv420p14Frame`] at call sites for + /// readability, all three are type aliases over this struct, /// - `width` or `height` is zero, /// - `width` is odd, /// - any stride is smaller than the plane's declared pixel width, @@ -2358,4 +2376,74 @@ mod tests { // source values). 
That's accepted behavior — the type system, // not `try_new_checked`, is what keeps yuv420p10le out of P010. } + + #[test] + fn p012_try_new_checked_accepts_shifted_samples() { + // Valid P012 samples: low 4 bits zero (12-bit value << 4). + let y = std::vec![(2048u16) << 4; 16 * 8]; // 12-bit mid-gray shifted up + let uv = std::vec![(2048u16) << 4; 16 * 4]; + P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).expect("shifted samples valid"); + } + + #[test] + fn p012_try_new_checked_rejects_low_bits_set() { + // A Y sample with any of the low 4 bits set — e.g. yuv420p12le + // value 0x0ABC landing where P012 expects `value << 4`. The check + // catches samples like this that are obviously mispacked. + let mut y = std::vec![(2048u16) << 4; 16 * 8]; + y[3 * 16 + 5] = 0x0ABC; // low 4 bits = 0xC ≠ 0 + let uv = std::vec![(2048u16) << 4; 16 * 4]; + let e = P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + match e { + PnFrameError::SampleLowBitsSet { + plane, + value, + low_bits, + .. + } => { + assert_eq!(plane, PnFramePlane::Y); + assert_eq!(value, 0x0ABC); + assert_eq!(low_bits, 4); + } + other => panic!("expected SampleLowBitsSet, got {other:?}"), + } + } + + /// Regression documenting a **worse known limitation** of + /// [`P012Frame::try_new_checked`] compared to P010: because the + /// low‑bits check only has 4 bits to work with at `BITS == 12`, + /// every multiple‑of‑16 `yuv420p12le` value passes silently. The + /// practical impact is that common limited‑range flat‑region + /// content in real decoder output — `Y = 256` (limited‑range + /// black), `UV = 2048` (neutral chroma), `Y = 1024` (full black) + /// — is entirely invisible to this check. + /// + /// This test pins the limitation with a reproducible input so + /// that: + /// 1. Users reading the test suite can see the exact failure + /// mode for `try_new_checked` on 12‑bit data. + /// 2. 
Any future attempt to strengthen `try_new_checked` (e.g., + /// into a statistical provenance heuristic) has a concrete + /// input to validate against. + /// 3. The `PnFrame` docs' warning about this limitation has a + /// named test to point to. + /// + /// For P012, the type system (choosing [`P012Frame`] vs + /// [`Yuv420p12Frame`] at construction based on decoder metadata) + /// is the only reliable provenance guarantee. + #[test] + fn p012_try_new_checked_accepts_low_packed_flat_content_by_design() { + // All values are multiples of 16 — exactly the set that slips + // through a 4-low-bits-zero check. `yuv420p12le` limited-range + // black and neutral chroma both satisfy this. + let y = std::vec![0x0100u16; 16 * 8]; // Y = 256 (limited-range black), multiple of 16 + let uv = std::vec![0x0800u16; 16 * 4]; // UV = 2048 (neutral chroma), multiple of 16 + let f = P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16) + .expect("known limitation: 4-low-bits-zero check cannot tell yuv420p12le from P012"); + assert_eq!(f.width(), 16); + // Downstream P012 kernels would extract `>> 4` — giving Y=16 and + // UV=128 instead of the intended Y=256 and UV=2048. Silent color + // corruption. The type system, not `try_new_checked`, must + // guarantee provenance for 12-bit. + } } diff --git a/src/lib.rs b/src/lib.rs index ec97890..40b295d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,8 +24,46 @@ //! [`with_hsv`](sinker::MixedSinker::with_hsv) to select which channels //! to derive. //! -//! The crate design also follows a per-format expansion plan with -//! defined implementation priority tiers for the conversion kernels. +//! # Supported source formats +//! +//! Shipped (all 4:2:0 subsampling): +//! +//! | Family | Bit depth | Packing | FFmpeg name | +//! | ---------------- | --------- | ---------------------- | --------------------- | +//! | [`Yuv420p`] | 8 | planar | `yuv420p` | +//! | [`Nv12`] | 8 | semi-planar UV | `nv12` | +//! 
| [`Nv21`] | 8 | semi-planar VU | `nv21` | +//! | [`Yuv420p10`] | 10 | planar, low-packed | `yuv420p10le` | +//! | [`Yuv420p12`] | 12 | planar, low-packed | `yuv420p12le` | +//! | [`Yuv420p14`] | 14 | planar, low-packed | `yuv420p14le` | +//! | [`P010`] | 10 | semi-planar, high-packed | `p010le` | +//! | [`P012`] | 12 | semi-planar, high-packed | `p012le` | +//! +//! Not yet shipped (follow-up): +//! +//! - **16‑bit families** (`Yuv420p16` / `P016`) — require a separate +//! kernel family because the Q15 chroma_sum overflows i32 at +//! `BITS == 16`. Current scalar / SIMD kernels `debug_assert!` out +//! `BITS == 16` precisely to surface this. +//! - **4:2:2 and 4:4:4** (`Yuv422p`, `Yuv444p`, `Nv16`, `Nv24`, +//! `Nv42`) — share the Q15 math but need their own row walkers +//! for the different chroma subsampling / stride. +//! - **Packed RGB sources** (`Rgb24`, `Bgr24`, `Rgba`, `Bgra`, +//! `Rgba1010102`, etc.). +//! +//! See [`yuv`] for the per-format module-level breakdown and +//! [`frame`] for the validated frame types plus the `BITS` const +//! generic on the high-bit-depth families (`Yuv420pFrame16` +//! and `PnFrame`). +//! +//! [`Yuv420p`]: crate::yuv::Yuv420p +//! [`Nv12`]: crate::yuv::Nv12 +//! [`Nv21`]: crate::yuv::Nv21 +//! [`Yuv420p10`]: crate::yuv::Yuv420p10 +//! [`Yuv420p12`]: crate::yuv::Yuv420p12 +//! [`Yuv420p14`]: crate::yuv::Yuv420p14 +//! [`P010`]: crate::yuv::P010 +//! [`P012`]: crate::yuv::P012 #![cfg_attr(not(feature = "std"), no_std)] #![cfg_attr(docsrs, feature(doc_cfg))] @@ -167,8 +205,9 @@ pub trait PixelSink { } /// Consume one input unit. Called by the kernel once per unit (one - /// row, for the row-granular kernels v0.1 ships). Input borrows may - /// be invalidated after the call returns — implementations must not + /// row, for the row-granular kernels currently shipped). Input + /// borrows may be invalidated after the call returns — + /// implementations must not /// retain them. 
/// /// Returns `Err` to short-circuit the walker: on the first `Err`, diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index f8ceba2..878d5e3 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -643,17 +643,20 @@ pub(crate) unsafe fn p_n_to_rgb_row( /// **native‑depth `u16`** RGB (low‑bit‑packed output, /// `yuv420p10le` / `yuv420p12le` convention — not P010/P012). /// -/// Same structure as [`p010_to_rgb_row`] up to the chroma compute; -/// the only differences are: -/// - `range_params_n::<10, 10>` → larger scales targeting the 10‑bit -/// output range. -/// - Clamp is explicit min/max to `[0, 1023]` via -/// [`clamp_u10`](crate::row::arch::neon::clamp_u10). +/// Same structure as [`super::neon::p_n_to_rgb_row`] up to the +/// chroma compute; the only differences are: +/// - `range_params_n::` → larger scales targeting the +/// native‑depth output range. +/// - Clamp is explicit min/max to `[0, (1 << BITS) - 1]` via +/// [`clamp_u10`](crate::row::arch::neon::clamp_u10) — the helper +/// name is historical; the actual max is derived from `BITS` at +/// the call site (1023 for P010, 4095 for P012). /// - Writes use two `vst3q_u16` calls per 16‑pixel block. /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index f0619a9..2a54fbd 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -500,18 +500,22 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { } } -/// WASM simd128 P010 → packed **8‑bit** RGB. +/// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → +/// packed **8‑bit** RGB. /// /// Block size 16 Y pixels / 8 chroma pairs per iteration. 
Mirrors -/// [`yuv420p10_to_rgb_row`] with two structural differences: -/// - Samples are shifted right by 6 (`u16x8_shr(_, 6)`) instead of -/// AND‑masked. +/// [`super::wasm_simd128::yuv_420p_n_to_rgb_row`] with two structural +/// differences: +/// - Samples are shifted right by `16 - BITS` (`u16x8_shr`, with +/// the shift amount computed from `BITS` once per call) instead +/// of AND‑masked. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_wasm`] /// (two `u8x16_swizzle` + two `i8x16_shuffle` combines). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -621,12 +625,14 @@ pub(crate) unsafe fn p_n_to_rgb_row( } } -/// WASM simd128 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed -/// `yuv420p10le` convention). +/// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → +/// packed **native‑depth `u16`** RGB (low‑bit‑packed output, +/// `yuv420pNle` convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 16deb67..10258c2 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -527,19 +527,23 @@ fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { unsafe { _mm256_min_epi16(_mm256_max_epi16(v, zero_v), max_v) } } -/// AVX2 P010 → packed **8‑bit** RGB. +/// AVX2 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 32 Y pixels / 16 chroma pairs per iteration. Mirrors -/// [`yuv420p10_to_rgb_row`] with two structural differences: -/// - Samples are shifted right by 6 (`_mm256_srli_epi16::<6>`) -/// instead of AND‑masked. 
+/// [`super::x86_avx2::yuv_420p_n_to_rgb_row`] with two structural +/// differences: +/// - Samples are shifted right by `16 - BITS` (`_mm256_srl_epi16`, +/// with a shift count computed from `BITS` once per call) instead +/// of AND‑masked. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx2`] /// (two `_mm256_shuffle_epi8` + two `_mm256_permute4x64_epi64` + /// two `_mm256_permute2x128_si256` per 32 chroma elements). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -660,12 +664,14 @@ pub(crate) unsafe fn p_n_to_rgb_row( } } -/// AVX2 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed -/// `yuv420p10le` convention). +/// AVX2 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 238a09a..3925276 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -572,12 +572,15 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u } } -/// AVX‑512 P010 → packed **8‑bit** RGB. +/// AVX‑512 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 64 Y pixels / 32 chroma pairs per iteration. Mirrors -/// [`yuv420p10_to_rgb_row`] with two structural differences: -/// - Samples are shifted right by 6 (`_mm512_srli_epi16::<6>`) -/// instead of AND‑masked. 
+/// [`super::x86_avx512::yuv_420p_n_to_rgb_row`] with two structural +/// differences: +/// - Samples are shifted right by `16 - BITS` (`_mm512_srl_epi16`, +/// with a shift count computed from `BITS` once per call) instead +/// of AND‑masked. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx512`] /// — per‑128‑lane shuffle + 64‑bit permute + cross‑vector /// `_mm512_permutex2var_epi64` to produce 32‑sample U and V @@ -585,7 +588,8 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -707,12 +711,14 @@ pub(crate) unsafe fn p_n_to_rgb_row( } } -/// AVX‑512 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed -/// `yuv420p10le` convention). +/// AVX‑512 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 1dd5f2d..75796bf 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -193,12 +193,14 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } -/// SSE4.1 P010 → packed **8‑bit** RGB. +/// SSE4.1 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 16 Y pixels / 8 chroma pairs per iteration. 
Differences -/// from [`yuv420p10_to_rgb_row`]: -/// - Samples are shifted right by 6 (`_mm_srli_epi16::<6>`) instead -/// of AND‑masked — P010's 10 active bits live in the HIGH 10 of +/// from [`super::x86_sse41::yuv_420p_n_to_rgb_row`]: +/// - Samples are shifted right by `16 - BITS` (`_mm_srl_epi16`, with +/// a shift count computed from `BITS` once per call) instead of +/// AND‑masked — Pn's `BITS` active bits live in the HIGH `BITS` of /// each `u16`. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16`] /// below (one `_mm_shuffle_epi8` + two 64‑bit unpacks per 16 @@ -206,7 +208,8 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -320,12 +323,14 @@ pub(crate) unsafe fn p_n_to_rgb_row( } } -/// SSE4.1 P010 → packed **10‑bit `u16`** RGB (native‑depth, -/// low‑bit‑packed — `yuv420p10le` convention). +/// SSE4.1 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<10>`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 26759c9..8d45b48 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -207,6 +207,13 @@ pub(crate) fn yuv_420p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // Low-bit-packed planar kernels are defined for BITS in {10, 12, 14}. + // 16 would overflow the Q15 chroma sum; 8 belongs to the non- + // const-generic `yuv_420_to_rgb_row` family. 
+ debug_assert!( + BITS == 10 || BITS == 12 || BITS == 14, + "yuv_420p_n_to_rgb_row only supports BITS in {{10, 12, 14}}" + ); debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); @@ -300,6 +307,12 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // Same BITS range as the u8-output counterpart. See + // `yuv_420p_n_to_rgb_row` for the rationale. + debug_assert!( + BITS == 10 || BITS == 12 || BITS == 14, + "yuv_420p_n_to_rgb_u16_row only supports BITS in {{10, 12, 14}}" + ); debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); @@ -373,6 +386,14 @@ pub(crate) fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // High-bit-packed Pn kernels are only defined for BITS in {10, 12}. + // Outside that set, `16 - BITS` could under/overflow and the Q15 + // coefficient table has no corresponding entry. Caught here before + // the SIMD dispatcher hands control to unsafe code. + debug_assert!( + BITS == 10 || BITS == 12, + "p_n_to_rgb_row only supports BITS in {{10, 12}}" + ); debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); @@ -443,6 +464,12 @@ pub(crate) fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // See `p_n_to_rgb_row` for the BITS range rationale. Duplicated + // here so either entry point catches misuse on its own. 
+ debug_assert!( + BITS == 10 || BITS == 12, + "p_n_to_rgb_u16_row only supports BITS in {{10, 12}}" + ); debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 5877255..4d210a7 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -2,9 +2,17 @@ //! written into my own buffers" consumer. //! //! Generic over the source format via an `F: SourceFormat` type -//! parameter. One `PixelSink` impl per supported format; v0.1 ships -//! the [`Yuv420p`](crate::yuv::Yuv420p), -//! [`Nv12`](crate::yuv::Nv12), and [`Nv21`](crate::yuv::Nv21) impls. +//! parameter. One `PixelSink` impl per supported format. Currently +//! ships impls for: +//! +//! - 8‑bit 4:2:0: [`Yuv420p`](crate::yuv::Yuv420p), +//! [`Nv12`](crate::yuv::Nv12), [`Nv21`](crate::yuv::Nv21). +//! - 10/12/14‑bit planar 4:2:0: [`Yuv420p10`](crate::yuv::Yuv420p10), +//! [`Yuv420p12`](crate::yuv::Yuv420p12), +//! [`Yuv420p14`](crate::yuv::Yuv420p14). +//! - 10/12‑bit semi‑planar high‑bit‑packed 4:2:0: +//! [`P010`](crate::yuv::P010), [`P012`](crate::yuv::P012). +//! //! All configuration and processing methods are fallible — no panics //! under normal contract violations — so the sink is usable on //! `panic = "abort"` targets. @@ -228,8 +236,12 @@ pub enum RowSlice { /// bits sit in the high 10 of its `u16`). #[display("UV Half 10")] UvHalf10, - /// Full‑width Y row of a **12‑bit** planar source ([`Yuv420p12`]). - /// `u16` samples, `width` elements, low‑bit‑packed. + /// Full‑width Y row of a **12‑bit** source — used for both the + /// planar ([`Yuv420p12`], low‑bit‑packed) and semi‑planar + /// ([`P012`], high‑bit‑packed) families. `u16` samples, `width` + /// elements. The packing direction depends on the source format; + /// the row‑shape check only verifies length, so a single variant + /// covers both. 
#[display("Y12")] Y12, /// Half‑width U row of a **12‑bit** planar source. `u16` samples, @@ -276,10 +288,9 @@ pub enum RowSlice { /// # Type parameter /// /// `F` identifies the source format — `Yuv420p`, `Nv12`, `Nv21`, -/// `Bgr24`, etc. Each format provides its own -/// `impl PixelSink for MixedSinker<'_, F>`. v0.1 ships impls for -/// [`Yuv420p`](crate::yuv::Yuv420p), [`Nv12`](crate::yuv::Nv12), and -/// [`Nv21`](crate::yuv::Nv21). +/// `Yuv420p10`, `Yuv420p12`, `Yuv420p14`, `P010`, `P012`, etc. Each +/// format provides its own `impl PixelSink for MixedSinker<'_, F>`. +/// See the module‑level docs for the full list of shipped impls. pub struct MixedSinker<'a, F: SourceFormat> { rgb: Option<&'a mut [u8]>, rgb_u16: Option<&'a mut [u16]>, @@ -1905,8 +1916,13 @@ mod tests { use super::*; use crate::{ ColorMatrix, - frame::{Nv12Frame, Nv21Frame, P010Frame, Yuv420p10Frame, Yuv420pFrame}, - yuv::{nv12_to, nv21_to, p010_to, yuv420p_to, yuv420p10_to}, + frame::{ + Nv12Frame, Nv21Frame, P010Frame, P012Frame, Yuv420p10Frame, Yuv420p12Frame, Yuv420p14Frame, + Yuv420pFrame, + }, + yuv::{ + nv12_to, nv21_to, p010_to, p012_to, yuv420p_to, yuv420p10_to, yuv420p12_to, yuv420p14_to, + }, }; fn solid_yuv420p_frame( @@ -3102,4 +3118,440 @@ mod tests { assert_eq!(rgb_scalar, rgb_simd); assert_eq!(rgb_u16_scalar, rgb_u16_simd); } + + // ---- Yuv420p12 --------------------------------------------------------- + // + // Planar 12-bit, low-bit-packed. Mirrors the Yuv420p10 shape — same + // planar layout, wider sample range. `mid-gray` for 12-bit is + // Y=UV=2048; native-depth white (full-range) is 4095. 
+    fn solid_yuv420p12_frame(
+        width: u32,
+        height: u32,
+        y: u16,
+        u: u16,
+        v: u16,
+    ) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
+        let w = width as usize;
+        let h = height as usize;
+        let cw = w / 2;
+        let ch = h / 2;
+        (
+            std::vec![y; w * h],
+            std::vec![u; cw * ch],
+            std::vec![v; cw * ch],
+        )
+    }
+
+    #[test]
+    fn yuv420p12_rgb_u8_only_gray_is_gray() {
+        let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048);
+        let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p12>::new(16, 8)
+            .with_rgb(&mut rgb)
+            .unwrap();
+        yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+ let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 4095, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(rgb_u8.iter().all(|&c| c == 255)); + assert!(rgb_u16.iter().all(|&c| c == 4095)); + } + + #[test] + fn yuv420p12_luma_downshifts_to_8bit() { + // Y=2048 at 12 bits → 2048 >> (12 - 8) = 128 at 8 bits. + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&l| l == 128)); + } + + #[test] + fn yuv420p12_hsv_from_gray_is_zero_hue_zero_sat() { + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut h = std::vec![0xFFu8; 16 * 8]; + let mut s = std::vec![0xFFu8; 16 * 8]; + let mut v = std::vec![0xFFu8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_hsv(&mut h, &mut s, &mut v) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(h.iter().all(|&b| b == 0)); + assert!(s.iter().all(|&b| b == 0)); + assert!(v.iter().all(|&b| b.abs_diff(128) <= 1)); + } + + #[test] + fn yuv420p12_rgb_u16_too_short_returns_err() { + let mut rgb = std::vec![0u16; 10]; + let err = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .err() + .unwrap(); + assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { .. 
})); + } + + #[test] + fn yuv420p12_with_simd_false_matches_with_simd_true() { + let (yp, up, vp) = solid_yuv420p12_frame(64, 16, 2400, 1600, 2800); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; + let mut s_scalar = MixedSinker::::new(64, 16) + .with_simd(false) + .with_rgb(&mut rgb_scalar) + .unwrap() + .with_rgb_u16(&mut rgb_u16_scalar) + .unwrap(); + yuv420p12_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + + let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; + let mut s_simd = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_simd) + .unwrap() + .with_rgb_u16(&mut rgb_u16_simd) + .unwrap(); + yuv420p12_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); + + assert_eq!(rgb_scalar, rgb_simd); + assert_eq!(rgb_u16_scalar, rgb_u16_simd); + } + + // ---- Yuv420p14 --------------------------------------------------------- + + fn solid_yuv420p14_frame( + width: u32, + height: u32, + y: u16, + u: u16, + v: u16, + ) -> (Vec, Vec, Vec) { + let w = width as usize; + let h = height as usize; + let cw = w / 2; + let ch = h / 2; + ( + std::vec![y; w * h], + std::vec![u; cw * ch], + std::vec![v; cw * ch], + ) + } + + #[test] + fn yuv420p14_rgb_u8_only_gray_is_gray() { + // 14-bit mid-gray: Y=UV=8192. 
+        let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p14>::new(16, 8)
+            .with_rgb(&mut rgb)
+            .unwrap();
+        yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+
+    #[test]
+    fn yuv420p14_rgb_u16_only_native_depth_gray() {
+        let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p14>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .unwrap();
+        yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(8192) <= 1, "got {px:?}");
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+            assert!(px[0] <= 16383);
+        }
+    }
+
+    #[test]
+    fn yuv420p14_luma_downshifts_to_8bit() {
+        // Y=8192 at 14 bits → 8192 >> (14 - 8) = 128.
+        let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut luma = std::vec![0u8; 16 * 8];
+        let mut sink = MixedSinker::<Yuv420p14>::new(16, 8)
+            .with_luma(&mut luma)
+            .unwrap();
+        yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(luma.iter().all(|&l| l == 128));
+    }
+
+    #[test]
+    fn yuv420p14_rgb_u8_and_u16_both_populated() {
+        let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 16383, 8192, 8192);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
+        let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p14>::new(16, 8)
+            .with_rgb(&mut rgb_u8)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16)
+            .unwrap();
+        yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(rgb_u8.iter().all(|&c| c == 255));
+        assert!(rgb_u16.iter().all(|&c| c == 16383));
+    }
+
+    #[test]
+    fn yuv420p14_with_simd_false_matches_with_simd_true() {
+        let (yp, up, vp) = solid_yuv420p14_frame(64, 16, 9600, 6400, 11200);
+        let src = Yuv420p14Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32);
+
+        let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3];
+        let mut s_scalar = MixedSinker::<Yuv420p14>::new(64, 16)
+            .with_simd(false)
+            .with_rgb(&mut rgb_scalar)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_scalar)
+            .unwrap();
+        yuv420p14_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap();
+
+        let mut rgb_simd = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3];
+        let mut s_simd = MixedSinker::<Yuv420p14>::new(64, 16)
+            .with_rgb(&mut rgb_simd)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_simd)
+            .unwrap();
+        yuv420p14_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap();
+
+        assert_eq!(rgb_scalar, rgb_simd);
+        assert_eq!(rgb_u16_scalar, rgb_u16_simd);
+    }
+
+    // ---- P012 --------------------------------------------------------------
+    //
+    //
Semi-planar 12-bit, high-bit-packed (samples in high 12 of each
+    // u16). Mirrors the P010 test shape — UV interleaved, `value << 4`.
+
+    fn solid_p012_frame(
+        width: u32,
+        height: u32,
+        y_12bit: u16,
+        u_12bit: u16,
+        v_12bit: u16,
+    ) -> (Vec<u16>, Vec<u16>) {
+        let w = width as usize;
+        let h = height as usize;
+        let cw = w / 2;
+        let ch = h / 2;
+        // Shift into the high 12 bits (P012 packing).
+        let y = std::vec![y_12bit << 4; w * h];
+        let uv: Vec<u16> = (0..cw * ch)
+            .flat_map(|_| [u_12bit << 4, v_12bit << 4])
+            .collect();
+        (y, uv)
+    }
+
+    #[test]
+    fn p012_rgb_u8_only_gray_is_gray() {
+        let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048);
+        let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P012>::new(16, 8).with_rgb(&mut rgb).unwrap();
+        p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+
+    #[test]
+    fn p012_rgb_u16_only_native_depth_gray() {
+        // Output is low-bit-packed 12-bit (yuv420p12le convention).
+        let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048);
+        let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P012>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .unwrap();
+        p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(2048) <= 1, "got {px:?}");
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+            assert!(
+                px[0] <= 4095,
+                "output must stay within 12-bit low-packed range"
+            );
+        }
+    }
+
+    #[test]
+    fn p012_rgb_u8_and_u16_both_populated() {
+        let (yp, uvp) = solid_p012_frame(16, 8, 4095, 2048, 2048);
+        let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
+        let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P012>::new(16, 8)
+            .with_rgb(&mut rgb_u8)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16)
+            .unwrap();
+        p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(rgb_u8.iter().all(|&c| c == 255));
+        assert!(rgb_u16.iter().all(|&c| c == 4095));
+    }
+
+    #[test]
+    fn p012_luma_downshifts_to_8bit() {
+        // Y=2048 at 12 bits, P012-packed (2048 << 4 = 0x8000). After >> 8,
+        // the 8-bit luma is 0x80 = 128 — same accessor as P010 since both
+        // store active bits in the high positions.
+        let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048);
+        let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut luma = std::vec![0u8; 16 * 8];
+        let mut sink = MixedSinker::<P012>::new(16, 8)
+            .with_luma(&mut luma)
+            .unwrap();
+        p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(luma.iter().all(|&l| l == 128));
+    }
+
+    #[test]
+    fn p012_matches_yuv420p12_mixed_sinker_with_shifted_samples() {
+        // Logical equivalence — same 12-bit samples fed through both
+        // layouts must produce byte-identical u8 RGB.
+        let w = 16u32;
+        let h = 8u32;
+        let y = 2400u16;
+        let u = 1600u16;
+        let v = 2800u16;
+
+        let (yp_p12, up_p12, vp_p12) = solid_yuv420p12_frame(w, h, y, u, v);
+        let src_p12 = Yuv420p12Frame::new(&yp_p12, &up_p12, &vp_p12, w, h, w, w / 2, w / 2);
+
+        let (yp_p012, uvp_p012) = solid_p012_frame(w, h, y, u, v);
+        let src_p012 = P012Frame::new(&yp_p012, &uvp_p012, w, h, w, w);
+
+        let mut rgb_yuv = std::vec![0u8; (w * h * 3) as usize];
+        let mut rgb_p012 = std::vec![0u8; (w * h * 3) as usize];
+        let mut s_yuv = MixedSinker::<Yuv420p12>::new(w as usize, h as usize)
+            .with_rgb(&mut rgb_yuv)
+            .unwrap();
+        let mut s_p012 = MixedSinker::<P012>::new(w as usize, h as usize)
+            .with_rgb(&mut rgb_p012)
+            .unwrap();
+        yuv420p12_to(&src_p12, true, ColorMatrix::Bt709, &mut s_yuv).unwrap();
+        p012_to(&src_p012, true, ColorMatrix::Bt709, &mut s_p012).unwrap();
+        assert_eq!(rgb_yuv, rgb_p012);
+    }
+
+    #[test]
+    fn p012_rgb_u16_too_short_returns_err() {
+        let mut rgb = std::vec![0u16; 10];
+        let err = MixedSinker::<P012>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .err()
+            .unwrap();
+        assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { ..
}));
+    }
+
+    #[test]
+    fn p012_with_simd_false_matches_with_simd_true() {
+        let (yp, uvp) = solid_p012_frame(64, 16, 2400, 1600, 2800);
+        let src = P012Frame::new(&yp, &uvp, 64, 16, 64, 64);
+
+        let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3];
+        let mut s_scalar = MixedSinker::<P012>::new(64, 16)
+            .with_simd(false)
+            .with_rgb(&mut rgb_scalar)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_scalar)
+            .unwrap();
+        p012_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap();
+
+        let mut rgb_simd = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3];
+        let mut s_simd = MixedSinker::<P012>::new(64, 16)
+            .with_rgb(&mut rgb_simd)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_simd)
+            .unwrap();
+        p012_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap();
+
+        assert_eq!(rgb_scalar, rgb_simd);
+        assert_eq!(rgb_u16_scalar, rgb_u16_simd);
+    }
 }
diff --git a/src/sinker/mod.rs b/src/sinker/mod.rs
index e6d6d0a..90ce325 100644
--- a/src/sinker/mod.rs
+++ b/src/sinker/mod.rs
@@ -1,10 +1,11 @@
 //! [`PixelSink`](crate::PixelSink) implementations shipped with the
 //! crate.
 //!
-//! v0.1 ships [`MixedSinker`](mixed::MixedSinker), which writes any
-//! subset of `{RGB, Luma, HSV}` into caller-provided buffers. Narrow
-//! newtype shortcuts (luma-only, RGB-only, HSV-only) will be added in
-//! follow-up commits once the MixedSinker path is proven.
+//! Currently ships [`MixedSinker`](mixed::MixedSinker), which writes
+//! any subset of `{RGB, Luma, HSV}` into caller-provided buffers.
+//! It has per-format `PixelSink` impls for all eight shipped YUV
+//! source formats (see [`crate::yuv`] for the list). Narrow newtype
+//! shortcuts (luma-only, RGB-only, HSV-only) are a follow-up.
 //!
 //! `MixedSinker` keeps a lazily‑grown `Vec` scratch buffer for
 //!
the HSV‑without‑RGB path, so it is only compiled under the `std` diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs index b3f1f4c..fcbe395 100644 --- a/src/yuv/mod.rs +++ b/src/yuv/mod.rs @@ -1,6 +1,9 @@ //! YUV source kernels. //! -//! One sub-module and kernel per YUV pixel-format family: +//! One sub-module and kernel per YUV pixel-format family. +//! +//! # Shipped (8-bit 4:2:0) +//! //! - [`Yuv420p`](crate::yuv::Yuv420p) — the mainline 4:2:0 **planar** //! layout (H.264 / HEVC / AV1 / VP9 software‑decode default). //! - [`Nv12`](crate::yuv::Nv12) — 4:2:0 **semi‑planar** with interleaved @@ -8,12 +11,18 @@ //! default). //! - [`Nv21`](crate::yuv::Nv21) — 4:2:0 semi‑planar with **VU**-ordered //! chroma (Android MediaCodec default). +//! +//! # Shipped (high-bit-depth 4:2:0, low-bit-packed planar) +//! //! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits //! per sample (HDR10 / 10‑bit SDR software decode). //! - [`Yuv420p12`](crate::yuv::Yuv420p12) — 4:2:0 planar at 12 bits //! per sample (HEVC Main 12 / VP9 Profile 3 software decode). //! - [`Yuv420p14`](crate::yuv::Yuv420p14) — 4:2:0 planar at 14 bits //! per sample (grading / mastering pipelines). +//! +//! # Shipped (high-bit-depth 4:2:0, high-bit-packed semi-planar) +//! //! - [`P010`](crate::yuv::P010) — 4:2:0 semi‑planar at 10 bits per //! sample, high‑bit‑packed (HDR hardware decode: VideoToolbox, //! VA‑API, NVDEC, D3D11VA, Intel QSV). @@ -21,7 +30,21 @@ //! sample, high‑bit‑packed (HEVC Main 12 / VP9 Profile 3 hardware //! decode). //! -//! Other families land in follow-up commits. +//! # Not yet shipped +//! +//! - **16‑bit** (`Yuv420p16` / `P016`) — blocked on a separate +//! kernel family. At `BITS == 16` the Q15 chroma_sum overflows +//! i32, so this needs either i64 intermediates or a lower‑Q +//! coefficient format. The scalar and SIMD kernels here +//! deliberately gate `BITS` to `{10, 12, 14}` (planar) and +//! `{10, 12}` (semi‑planar) via `debug_assert!`. +//! 
- **4:2:2 / 4:4:4** (`Yuv422p`, `Yuv444p`, `Nv16`, `Nv24`, +//! `Nv42`) — follow‑up, not yet started. They share the scalar +//! Q15 math but need their own row walkers (different chroma +//! subsampling / stride). +//! - **Packed RGB sources** (`Rgb24`, `Bgr24`, `Rgba`, `Bgra`, +//! `Rgba1010102`, etc.) — follow‑up. Will land as their own +//! family of `*_to` kernels feeding a new row‑shape subtrait. mod nv12; mod nv21; diff --git a/src/yuv/yuv420p10.rs b/src/yuv/yuv420p10.rs index 1a85e06..812f180 100644 --- a/src/yuv/yuv420p10.rs +++ b/src/yuv/yuv420p10.rs @@ -4,14 +4,14 @@ //! plus U / V at half width and half height — but sample width is //! **`u16`** (10 active bits in the low bits of each element). The //! [`Yuv420p10Frame`] type alias pins the bit depth; the underlying -//! [`Yuv420pFrame16`] struct is const‑generic over `BITS` so 12‑bit -//! and 14‑bit variants can be added by relaxing its validator without -//! changing kernel math. +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS` and the +//! 12‑bit / 14‑bit siblings ([`super::Yuv420p12`] / [`super::Yuv420p14`]) +//! reuse the same scalar + SIMD kernel family with a different +//! monomorphization. //! -//! Ships in colconv v0.2 as the first high‑bit‑depth format (HDR / -//! 10‑bit SDR keystone). Kernel semantics match [`super::Yuv420p`]: -//! two consecutive Y rows share one chroma row (4:2:0), chroma is -//! nearest‑neighbor upsampled in registers inside the row primitive. +//! Kernel semantics match [`super::Yuv420p`]: two consecutive Y rows +//! share one chroma row (4:2:0), chroma is nearest‑neighbor upsampled +//! in registers inside the row primitive. use crate::{ ColorMatrix, PixelSink, SourceFormat, @@ -22,10 +22,11 @@ use crate::{ /// Zero‑sized marker for the YUV 4:2:0 **10‑bit** source format. Used /// as the `F` type parameter on [`crate::sinker::MixedSinker`]. 
/// -/// colconv v0.2 ships only the 10‑bit specialization; 12‑ and 14‑bit -/// will arrive as separate markers (`Yuv420p12`, `Yuv420p14`) that -/// refer to the same underlying [`Yuv420pFrame16`] struct with -/// different `BITS` values. +/// 12‑bit and 14‑bit siblings ship as separate markers +/// ([`super::Yuv420p12`] / [`super::Yuv420p14`]) on the same +/// [`Yuv420pFrame16`] struct with different `BITS` values. 16‑bit +/// needs a different kernel family (Q15 chroma_sum overflows i32) and +/// is not yet shipped. #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] pub struct Yuv420p10;