diff --git a/Cargo.toml b/Cargo.toml index 458d138..4c98087 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,10 +32,22 @@ harness = false name = "yuv_420p10_to_rgb" harness = false +[[bench]] +name = "yuv_420p12_to_rgb" +harness = false + +[[bench]] +name = "yuv_420p14_to_rgb" +harness = false + [[bench]] name = "p010_to_rgb" harness = false +[[bench]] +name = "p012_to_rgb" +harness = false + [[bench]] name = "rgb_to_hsv" harness = false diff --git a/benches/p012_to_rgb.rs b/benches/p012_to_rgb.rs new file mode 100644 index 0000000..9443f6f --- /dev/null +++ b/benches/p012_to_rgb.rs @@ -0,0 +1,94 @@ +//! Per‑row P012 (semi‑planar 4:2:0, 12‑bit, high‑bit‑packed) → RGB +//! throughput baseline. +//! +//! Mirrors [`p010_to_rgb`] but feeds 12‑bit high‑bit‑packed samples +//! (12 active bits in the high 12 of each `u16`, low 4 bits zero). + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{p012_to_rgb_row, p012_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic P012‑packed pseudo‑random +/// sequence — 12‑bit values shifted into the high 12 bits of each +/// `u16` (low 4 bits zero), matching the real P012 storage layout. +fn fill_pseudo_random_p012(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = (((state >> 8) & 0xFFF) as u16) << 4; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("p012_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + // UV row payload is `width` u16 elements (w / 2 interleaved pairs). 
+ let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p012(&mut y, 0x1111); + fill_pseudo_random_p012(&mut uv, 0x2222); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p012_to_rgb_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("p012_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p012(&mut y, 0x1111); + fill_pseudo_random_p012(&mut uv, 0x2222); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p012_to_rgb_u16_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/benches/yuv_420p12_to_rgb.rs b/benches/yuv_420p12_to_rgb.rs new file mode 100644 index 0000000..cba3e28 --- /dev/null +++ b/benches/yuv_420p12_to_rgb.rs @@ -0,0 +1,100 @@ +//! Per‑row YUV 4:2:0 12‑bit → packed RGB throughput baseline. +//! +//! Mirrors [`yuv_420p10_to_rgb`] but feeds 12‑bit low‑bit‑packed +//! samples (values ≤ 4095). Same `u8_*` / `u16_*` split per width so +//! scalar vs SIMD speedup is a two‑line comparison in the Criterion +//! report. 
+ +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic 12‑bit pseudo‑random +/// sequence — values occupy the low 12 bits of each `u16`, matching +/// the storage layout of `yuv420p12le`. +fn fill_pseudo_random_u16(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = ((state >> 8) & 0xFFF) as u16; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("yuv420p12_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p12_to_rgb_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("yuv420p12_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); 
+ + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p12_to_rgb_u16_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/benches/yuv_420p14_to_rgb.rs b/benches/yuv_420p14_to_rgb.rs new file mode 100644 index 0000000..ac6e5ee --- /dev/null +++ b/benches/yuv_420p14_to_rgb.rs @@ -0,0 +1,100 @@ +//! Per‑row YUV 4:2:0 14‑bit → packed RGB throughput baseline. +//! +//! Mirrors [`yuv_420p10_to_rgb`] but feeds 14‑bit low‑bit‑packed +//! samples (values ≤ 16383). Same `u8_*` / `u16_*` split per width so +//! scalar vs SIMD speedup is a two‑line comparison in the Criterion +//! report. + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic 14‑bit pseudo‑random +/// sequence — values occupy the low 14 bits of each `u16`, matching +/// the storage layout of `yuv420p14le`. 
+fn fill_pseudo_random_u16(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = ((state >> 8) & 0x3FFF) as u16; + } +} + +fn bench(c: &mut Criterion) { + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + let mut group_u8 = c.benchmark_group("yuv420p14_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p14_to_rgb_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + let mut group_u16 = c.benchmark_group("yuv420p14_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u16; w * 3]; + + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p14_to_rgb_u16_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + 
+criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/src/frame.rs b/src/frame.rs index 8becc63..a56353f 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -496,30 +496,30 @@ pub enum Nv12FrameError { /// /// # Input sample range and packing sanity /// -/// Each `u16` sample's 10 active bits live in the high 10 positions; -/// the low 6 bits are expected to be zero. [`Self::try_new`] validates -/// geometry only. +/// Each `u16` sample's `BITS` active bits live in the high `BITS` +/// positions; the low `16 - BITS` bits are expected to be zero. +/// [`Self::try_new`] validates geometry only. /// /// [`Self::try_new_checked`] additionally scans every sample and -/// rejects any with non‑zero low 6 bits — a **necessary but not -/// sufficient** packing sanity check. It catches mispacked -/// `yuv420p10le` buffers as long as **at least one** sample has -/// low‑bit content (the usual case for noisy real‑world image data), -/// but it **cannot distinguish** P010 from a `yuv420p10le` buffer -/// whose samples all happen to be multiples of 64. Values like -/// `Y = 64` (limited‑range black) and `UV = 512` (neutral chroma) -/// both have low 6 bits zero and so pass the check, even though the -/// buffer layout is wrong. For strict provenance, callers must rely -/// on their source format metadata and pick the right frame type -/// ([`P010Frame`] vs [`Yuv420p10Frame`]) at construction. +/// rejects any with non‑zero low `16 - BITS` bits — a **necessary +/// but not sufficient** packing sanity check. Its catch rate +/// weakens as `BITS` grows: at `BITS == 10` it rejects 63/64 random +/// samples and is a strong signal; at `BITS == 12` it only rejects +/// 15/16, and **common flat‑region values in decoder output are +/// exactly the ones that slip through** (`Y = 256/1024` limited +/// black, `UV = 2048` neutral chroma are all multiples of 16 in +/// both layouts). See [`Self::try_new_checked`] for the full +/// table. 
For strict provenance, callers must rely on their source +/// format metadata and pick the right frame type ([`PnFrame`] vs +/// [`Yuv420pFrame16`]) at construction. /// -/// Kernels shift each load right by 6 to extract the 10‑bit value, -/// so mispacked input (e.g. a `yuv420p10le` buffer handed to the -/// P010 kernel) produces deterministic, backend‑independent output -/// — wrong colors, but consistently wrong across scalar + every -/// SIMD backend, which is visible in any output diff. +/// Kernels shift each load right by `16 - BITS` to extract the +/// active value, so mispacked input (e.g. a `yuv420p12le` buffer +/// handed to the P012 kernel) produces deterministic, backend‑ +/// independent output — wrong colors, but consistently wrong across +/// scalar + every SIMD backend, which is visible in any output diff. #[derive(Debug, Clone, Copy)] -pub struct P010Frame<'a> { +pub struct PnFrame<'a, const BITS: u32> { y: &'a [u16], uv: &'a [u16], width: u32, @@ -528,7 +528,7 @@ pub struct P010Frame<'a> { uv_stride: u32, } -impl<'a> P010Frame<'a> { +impl<'a, const BITS: u32> PnFrame<'a, BITS> { /// Constructs a new [`P010Frame`], validating dimensions and plane /// lengths. Strides are in `u16` **samples**. /// @@ -548,19 +548,26 @@ impl<'a> P010Frame<'a> { height: u32, y_stride: u32, uv_stride: u32, - ) -> Result { + ) -> Result { + // Guard the `BITS` parameter at the top — 10 and 12 are the only + // high-bit-packed depths supported by the Q15 kernel family. 14 + // exists in the planar `yuv420p14le` family but not as a Pn + // hardware output; 16 would need i64 intermediates. 
+ if BITS != 10 && BITS != 12 { + return Err(PnFrameError::UnsupportedBits { bits: BITS }); + } if width == 0 || height == 0 { - return Err(P010FrameError::ZeroDimension { width, height }); + return Err(PnFrameError::ZeroDimension { width, height }); } if width & 1 != 0 { - return Err(P010FrameError::OddWidth { width }); + return Err(PnFrameError::OddWidth { width }); } if y_stride < width { - return Err(P010FrameError::YStrideTooSmall { width, y_stride }); + return Err(PnFrameError::YStrideTooSmall { width, y_stride }); } let uv_row_elems = width; if uv_stride < uv_row_elems { - return Err(P010FrameError::UvStrideTooSmall { + return Err(PnFrameError::UvStrideTooSmall { uv_row_elems, uv_stride, }); @@ -569,14 +576,14 @@ impl<'a> P010Frame<'a> { let y_min = match (y_stride as usize).checked_mul(height as usize) { Some(v) => v, None => { - return Err(P010FrameError::GeometryOverflow { + return Err(PnFrameError::GeometryOverflow { stride: y_stride, rows: height, }); } }; if y.len() < y_min { - return Err(P010FrameError::YPlaneTooShort { + return Err(PnFrameError::YPlaneTooShort { expected: y_min, actual: y.len(), }); @@ -585,14 +592,14 @@ impl<'a> P010Frame<'a> { let uv_min = match (uv_stride as usize).checked_mul(chroma_height as usize) { Some(v) => v, None => { - return Err(P010FrameError::GeometryOverflow { + return Err(PnFrameError::GeometryOverflow { stride: uv_stride, rows: chroma_height, }); } }; if uv.len() < uv_min { - return Err(P010FrameError::UvPlaneTooShort { + return Err(PnFrameError::UvPlaneTooShort { expected: uv_min, actual: uv.len(), }); @@ -621,34 +628,49 @@ impl<'a> P010Frame<'a> { ) -> Self { match Self::try_new(y, uv, width, height, y_stride, uv_stride) { Ok(frame) => frame, - Err(_) => panic!("invalid P010Frame dimensions or plane lengths"), + Err(_) => panic!("invalid PnFrame dimensions, plane lengths, or BITS value"), } } /// Like [`Self::try_new`] but additionally scans every sample and - /// rejects any whose **low 6 bits** are non‑zero. 
A valid P010 - /// sample has its 10 active bits in the high 10 positions and zero - /// below, so non‑zero low bits is evidence the buffer isn't P010. + /// rejects any whose **low `16 - BITS` bits** are non‑zero. A valid + /// high‑bit‑packed sample has its `BITS` active bits in the high + /// `BITS` positions and zero below, so non‑zero low bits is + /// evidence the buffer isn't Pn‑shaped. /// /// **This is a packing sanity check, not a provenance validator.** - /// The check catches noisy `yuv420p10le` data (where most samples - /// have low‑bit content), but it **cannot** distinguish P010 from - /// a `yuv420p10le` buffer whose samples all happen to be multiples - /// of 64. Common flat‑region values like `Y = 64` (limited‑range - /// black) or `UV = 512` (neutral chroma) are multiples of 64 in - /// both layouts, so a yuv420p10le buffer of flat content will - /// silently pass this check. Callers who need strict provenance - /// must rely on their source format metadata and pick the right - /// frame type at construction ([`P010Frame`] vs [`Yuv420p10Frame`]); - /// no runtime check on opaque `u16` data can reliably tell the two - /// layouts apart. + /// The check catches noisy low‑bit‑packed data (where most samples + /// have low‑bit content), but it **cannot** distinguish Pn from a + /// low‑bit‑packed buffer whose samples all happen to be multiples + /// of `1 << (16 - BITS)`. The catch rate scales with `BITS`: + /// + /// - `BITS == 10` (P010): 6 low bits must be zero. Random u16 + /// samples pass with probability `1/64`; noisy `yuv420p10le` + /// data is almost always caught. + /// - `BITS == 12` (P012): only 4 low bits. Pass probability is + /// `1/16` — 4× weaker. **Common limited‑range flat‑region values + /// (`Y = 256` limited black, `UV = 2048` neutral chroma, + /// `Y = 1024` full black) are all multiples of 16 in both + /// layouts**, so flat `yuv420p12le` content passes **every + /// time**. 
The `>> 4` extraction in the Pn kernels then + /// discards the real signal and produces badly darkened + /// output. For P012, prefer format metadata over this check. + /// + /// Callers who need strict provenance must rely on their source + /// format metadata and pick the right frame type at construction + /// ([`PnFrame`] vs [`Yuv420pFrame16`]); no runtime check on opaque + /// `u16` data can reliably tell the two layouts apart, and the + /// weakness is proportionally worse the higher the `BITS` value. + /// The regression test + /// `p012_try_new_checked_accepts_low_packed_flat_content_by_design` + /// in `frame::tests` pins this limitation in code. /// /// Cost: one O(plane_size) scan per plane. The default /// [`Self::try_new`] skips this so the hot path stays O(1). /// - /// Returns [`P010FrameError::SampleLowBitsSet`] on the first - /// offending sample — carries the plane, element index, and - /// offending value. + /// Returns [`PnFrameError::SampleLowBitsSet`] on the first + /// offending sample — carries the plane, element index, offending + /// value, and the number of low bits expected to be zero. 
#[cfg_attr(not(tarpaulin), inline(always))] pub fn try_new_checked( y: &'a [u16], @@ -657,8 +679,10 @@ impl<'a> P010Frame<'a> { height: u32, y_stride: u32, uv_stride: u32, - ) -> Result { + ) -> Result { let frame = Self::try_new(y, uv, width, height, y_stride, uv_stride)?; + let low_bits = 16 - BITS; + let low_mask: u16 = ((1u32 << low_bits) - 1) as u16; let w = width as usize; let h = height as usize; let uv_w = w; // interleaved: `width / 2` pairs × 2 elements @@ -666,11 +690,12 @@ impl<'a> P010Frame<'a> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s & 0x3F != 0 { - return Err(P010FrameError::SampleLowBitsSet { - plane: P010FramePlane::Y, + if s & low_mask != 0 { + return Err(PnFrameError::SampleLowBitsSet { + plane: PnFramePlane::Y, index: start + col, value: s, + low_bits, }); } } @@ -678,11 +703,12 @@ impl<'a> P010Frame<'a> { for row in 0..chroma_h { let start = row * uv_stride as usize; for (col, &s) in uv[start..start + uv_w].iter().enumerate() { - if s & 0x3F != 0 { - return Err(P010FrameError::SampleLowBitsSet { - plane: P010FramePlane::Uv, + if s & low_mask != 0 { + return Err(PnFrameError::SampleLowBitsSet { + plane: PnFramePlane::Uv, index: start + col, value: s, + low_bits, }); } } @@ -730,23 +756,51 @@ impl<'a> P010Frame<'a> { pub const fn uv_stride(&self) -> u32 { self.uv_stride } + + /// Active bit depth — 10 or 12. Mirrors the `BITS` const parameter + /// so generic code can read it without naming the type. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn bits(&self) -> u32 { + BITS + } } -/// Identifies which plane of a [`P010Frame`] a -/// [`P010FrameError::SampleLowBitsSet`] refers to. +/// Type alias for a validated P010 frame (10‑bit, high‑bit‑packed). +/// Use this name at call sites for readability. +pub type P010Frame<'a> = PnFrame<'a, 10>; + +/// Type alias for a validated P012 frame (12‑bit, high‑bit‑packed). 
+/// Same layout as [`P010Frame`] but with 12 active bits in the high +/// 12 of each `u16` (`sample = value << 4`, low 4 bits zero). +pub type P012Frame<'a> = PnFrame<'a, 12>; + +/// Identifies which plane of a [`PnFrame`] a +/// [`PnFrameError::SampleLowBitsSet`] refers to. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)] -pub enum P010FramePlane { +pub enum PnFramePlane { /// Luma plane. Y, /// Interleaved UV plane. Uv, } -/// Errors returned by [`P010Frame::try_new`] and -/// [`P010Frame::try_new_checked`]. +/// Back‑compat alias for the pre‑generalization plane enum name. +pub type P010FramePlane = PnFramePlane; + +/// Errors returned by [`PnFrame::try_new`] and +/// [`PnFrame::try_new_checked`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] #[non_exhaustive] -pub enum P010FrameError { +pub enum PnFrameError { + /// `BITS` was not one of the supported high‑bit‑packed depths + /// (10, 12). 14 exists in the planar `yuv420p14le` family but not + /// as a Pn hardware output; 16 would need a different kernel + /// family. + #[error("unsupported BITS ({bits}) for PnFrame; must be 10 or 12")] + UnsupportedBits { + /// The unsupported value of the `BITS` const parameter. + bits: u32, + }, /// `width` or `height` was zero. #[error("width ({width}) or height ({height}) is zero")] ZeroDimension { @@ -803,29 +857,34 @@ pub enum P010FrameError { /// Row count that overflowed against the stride. rows: u32, }, - /// A sample's low 6 bits were non‑zero — P010 packs its 10 active - /// bits in the high 10 of each `u16`, so valid samples are always - /// multiples of 64 (`value << 6`). Only - /// [`P010Frame::try_new_checked`] can produce this error. + /// A sample's low `16 - BITS` bits were non‑zero — a Pn sample + /// packs its `BITS` active bits in the high `BITS` of each `u16`, + /// so valid samples are always multiples of `1 << (16 - BITS)` + /// (64 for 10‑bit, 16 for 12‑bit). 
Only + /// [`PnFrame::try_new_checked`] can produce this error. /// /// Note: the absence of this error does **not** prove the buffer - /// is P010. A `yuv420p10le` buffer of samples that all happen to - /// be multiples of 64 (e.g. `Y = 64`, `UV = 512`) passes the - /// check silently. See [`P010Frame::try_new_checked`] for the - /// full discussion. + /// is Pn. A low‑bit‑packed buffer of samples that all happen to be + /// multiples of `1 << (16 - BITS)` passes the check silently. See + /// [`PnFrame::try_new_checked`] for the full discussion. #[error( - "sample {value:#06x} on plane {plane} at element {index} has non-zero low 6 bits (not a valid P010 sample)" + "sample {value:#06x} on plane {plane} at element {index} has non-zero low {low_bits} bits (not a valid Pn sample at the declared BITS)" )] SampleLowBitsSet { /// Which plane the offending sample lives on. - plane: P010FramePlane, + plane: PnFramePlane, /// Element index within that plane's slice. index: usize, /// The offending sample value. value: u16, + /// Number of low bits expected to be zero (`16 - BITS`). + low_bits: u32, }, } +/// Back‑compat alias for the pre‑generalization error enum name. +pub type P010FrameError = PnFrameError; + /// A validated NV21 (semi‑planar 4:2:0) frame. /// /// Structurally identical to [`Nv12Frame`] — one full-size luma plane @@ -1094,11 +1153,13 @@ pub enum Nv21FrameError { /// [`Self::try_new_checked`] — it scans every sample and returns /// [`Yuv420pFrame16Error::SampleOutOfRange`] on the first violation. /// -/// colconv v0.2 ships `BITS == 10` only (the use‑case keystone for -/// HDR and 10‑bit SDR). 12 and 14 are mechanical follow‑ups that -/// just relax the constructor's `BITS` check and add tiered aliases -/// — the kernel math (Q15 coefficients + i32 intermediates) works -/// unchanged across all three, derived at compile time from `BITS`. 
+/// All three supported depths — `BITS == 10` (HDR10 / 10‑bit SDR +/// keystone), `BITS == 12` (HEVC Main 12 / VP9 Profile 3), and +/// `BITS == 14` (grading / mastering pipelines) — share the same +/// scalar + SIMD kernel family. The Q15 coefficients + i32 +/// intermediates work unchanged across all three, derived at +/// compile time from `BITS`; the constructor validates the `BITS` +/// value against the set `{10, 12, 14}` up front. /// /// 16‑bit input (which would overflow the i32 chroma sum in the /// Q15 path) is **not** represented by this type — it needs a @@ -1130,8 +1191,9 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { /// lengths, and the `BITS` parameter. /// /// Returns [`Yuv420pFrame16Error`] if any of: - /// - `BITS` is not 10, 12, or 14 (colconv v0.2 additionally rejects - /// 12/14 at the type alias layer — see [`Yuv420p10Frame`]), + /// - `BITS` is not 10, 12, or 14 — use [`Yuv420p10Frame`], + /// [`Yuv420p12Frame`], or [`Yuv420p14Frame`] at call sites for + /// readability, all three are type aliases over this struct, /// - `width` or `height` is zero, /// - `width` is odd, /// - any stride is smaller than the plane's declared pixel width, @@ -1411,6 +1473,20 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { /// for readability. pub type Yuv420p10Frame<'a> = Yuv420pFrame16<'a, 10>; +/// Type alias for a validated YUV 4:2:0 planar frame at 12 bits per +/// sample (`AV_PIX_FMT_YUV420P12LE`). Tight wrapper over +/// [`Yuv420pFrame16`] with `BITS == 12` — same low‑bit‑packed `u16` +/// layout as [`Yuv420p10Frame`], just with 12 active bits in the +/// low 12 of each element (upper 4 bits zero). +pub type Yuv420p12Frame<'a> = Yuv420pFrame16<'a, 12>; + +/// Type alias for a validated YUV 4:2:0 planar frame at 14 bits per +/// sample (`AV_PIX_FMT_YUV420P14LE`). 
Tight wrapper over +/// [`Yuv420pFrame16`] with `BITS == 14` — same low‑bit‑packed `u16` +/// layout as [`Yuv420p10Frame`], just with 14 active bits in the +/// low 14 of each element (upper 2 bits zero). +pub type Yuv420p14Frame<'a> = Yuv420pFrame16<'a, 14>; + /// Errors returned by [`Yuv420pFrame16::try_new`]. Variant shape /// mirrors [`Yuv420pFrameError`], with `UnsupportedBits` added for /// the new `BITS` parameter and all sizes expressed in **samples** @@ -2163,28 +2239,28 @@ mod tests { fn p010_try_new_rejects_odd_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 15, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::OddWidth { width: 15 })); + assert!(matches!(e, PnFrameError::OddWidth { width: 15 })); } #[test] fn p010_try_new_rejects_zero_dim() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 0, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::ZeroDimension { .. })); + assert!(matches!(e, PnFrameError::ZeroDimension { .. })); } #[test] fn p010_try_new_rejects_y_stride_under_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 16, 8, 8, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YStrideTooSmall { .. })); + assert!(matches!(e, PnFrameError::YStrideTooSmall { .. })); } #[test] fn p010_try_new_rejects_uv_stride_under_width() { let (y, uv) = p010_planes(); let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 8).unwrap_err(); - assert!(matches!(e, P010FrameError::UvStrideTooSmall { .. })); + assert!(matches!(e, PnFrameError::UvStrideTooSmall { .. })); } #[test] @@ -2192,7 +2268,7 @@ mod tests { let y = std::vec![0u16; 10]; let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YPlaneTooShort { .. })); + assert!(matches!(e, PnFrameError::YPlaneTooShort { .. 
})); } #[test] @@ -2200,11 +2276,11 @@ mod tests { let y = std::vec![0u16; 16 * 8]; let uv = std::vec![0x8000u16; 8]; let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::UvPlaneTooShort { .. })); + assert!(matches!(e, PnFrameError::UvPlaneTooShort { .. })); } #[test] - #[should_panic(expected = "invalid P010Frame")] + #[should_panic(expected = "invalid PnFrame")] fn p010_new_panics_on_invalid() { let y = std::vec![0u16; 10]; let uv = std::vec![0x8000u16; 16 * 4]; @@ -2218,7 +2294,7 @@ mod tests { let y: [u16; 0] = []; let uv: [u16; 0] = []; let e = P010Frame::try_new(&y, &uv, big, big, big, big).unwrap_err(); - assert!(matches!(e, P010FrameError::GeometryOverflow { .. })); + assert!(matches!(e, PnFrameError::GeometryOverflow { .. })); } #[test] @@ -2239,7 +2315,7 @@ mod tests { let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); match e { - P010FrameError::SampleLowBitsSet { plane, value, .. } => { + PnFrameError::SampleLowBitsSet { plane, value, .. } => { assert_eq!(plane, P010FramePlane::Y); assert_eq!(value, 0x03FF); } @@ -2255,7 +2331,7 @@ mod tests { let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); assert!(matches!( e, - P010FrameError::SampleLowBitsSet { + PnFrameError::SampleLowBitsSet { plane: P010FramePlane::Uv, value: 0x0001, .. @@ -2268,7 +2344,7 @@ mod tests { let y = std::vec![0u16; 10]; // Too small. let uv = std::vec![0x8000u16; 16 * 4]; let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); - assert!(matches!(e, P010FrameError::YPlaneTooShort { .. })); + assert!(matches!(e, PnFrameError::YPlaneTooShort { .. })); } /// Regression documenting a **known limitation** of @@ -2300,4 +2376,74 @@ mod tests { // source values). That's accepted behavior — the type system, // not `try_new_checked`, is what keeps yuv420p10le out of P010. 
} + + #[test] + fn p012_try_new_checked_accepts_shifted_samples() { + // Valid P012 samples: low 4 bits zero (12-bit value << 4). + let y = std::vec![(2048u16) << 4; 16 * 8]; // 12-bit mid-gray shifted up + let uv = std::vec![(2048u16) << 4; 16 * 4]; + P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).expect("shifted samples valid"); + } + + #[test] + fn p012_try_new_checked_rejects_low_bits_set() { + // A Y sample with any of the low 4 bits set — e.g. yuv420p12le + // value 0x0ABC landing where P012 expects `value << 4`. The check + // catches samples like this that are obviously mispacked. + let mut y = std::vec![(2048u16) << 4; 16 * 8]; + y[3 * 16 + 5] = 0x0ABC; // low 4 bits = 0xC ≠ 0 + let uv = std::vec![(2048u16) << 4; 16 * 4]; + let e = P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + match e { + PnFrameError::SampleLowBitsSet { + plane, + value, + low_bits, + .. + } => { + assert_eq!(plane, PnFramePlane::Y); + assert_eq!(value, 0x0ABC); + assert_eq!(low_bits, 4); + } + other => panic!("expected SampleLowBitsSet, got {other:?}"), + } + } + + /// Regression documenting a **worse known limitation** of + /// [`P012Frame::try_new_checked`] compared to P010: because the + /// low‑bits check only has 4 bits to work with at `BITS == 12`, + /// every multiple‑of‑16 `yuv420p12le` value passes silently. The + /// practical impact is that common limited‑range flat‑region + /// content in real decoder output — `Y = 256` (limited‑range + /// black), `UV = 2048` (neutral chroma), `Y = 1024` (full black) + /// — is entirely invisible to this check. + /// + /// This test pins the limitation with a reproducible input so + /// that: + /// 1. Users reading the test suite can see the exact failure + /// mode for `try_new_checked` on 12‑bit data. + /// 2. Any future attempt to strengthen `try_new_checked` (e.g., + /// into a statistical provenance heuristic) has a concrete + /// input to validate against. + /// 3. 
The `PnFrame` docs' warning about this limitation has a + /// named test to point to. + /// + /// For P012, the type system (choosing [`P012Frame`] vs + /// [`Yuv420p12Frame`] at construction based on decoder metadata) + /// is the only reliable provenance guarantee. + #[test] + fn p012_try_new_checked_accepts_low_packed_flat_content_by_design() { + // All values are multiples of 16 — exactly the set that slips + // through a 4-low-bits-zero check. `yuv420p12le` limited-range + // black and neutral chroma both satisfy this. + let y = std::vec![0x0100u16; 16 * 8]; // Y = 256 (limited-range black), multiple of 16 + let uv = std::vec![0x0800u16; 16 * 4]; // UV = 2048 (neutral chroma), multiple of 16 + let f = P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16) + .expect("known limitation: 4-low-bits-zero check cannot tell yuv420p12le from P012"); + assert_eq!(f.width(), 16); + // Downstream P012 kernels would extract `>> 4` — giving Y=16 and + // UV=128 instead of the intended Y=256 and UV=2048. Silent color + // corruption. The type system, not `try_new_checked`, must + // guarantee provenance for 12-bit. + } } diff --git a/src/lib.rs b/src/lib.rs index ec97890..40b295d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,8 +24,46 @@ //! [`with_hsv`](sinker::MixedSinker::with_hsv) to select which channels //! to derive. //! -//! The crate design also follows a per-format expansion plan with -//! defined implementation priority tiers for the conversion kernels. +//! # Supported source formats +//! +//! Shipped (all 4:2:0 subsampling): +//! +//! | Family | Bit depth | Packing | FFmpeg name | +//! | ---------------- | --------- | ---------------------- | --------------------- | +//! | [`Yuv420p`] | 8 | planar | `yuv420p` | +//! | [`Nv12`] | 8 | semi-planar UV | `nv12` | +//! | [`Nv21`] | 8 | semi-planar VU | `nv21` | +//! | [`Yuv420p10`] | 10 | planar, low-packed | `yuv420p10le` | +//! | [`Yuv420p12`] | 12 | planar, low-packed | `yuv420p12le` | +//! 
| [`Yuv420p14`] | 14 | planar, low-packed | `yuv420p14le` | +//! | [`P010`] | 10 | semi-planar, high-packed | `p010le` | +//! | [`P012`] | 12 | semi-planar, high-packed | `p012le` | +//! +//! Not yet shipped (follow-up): +//! +//! - **16‑bit families** (`Yuv420p16` / `P016`) — require a separate +//! kernel family because the Q15 chroma_sum overflows i32 at +//! `BITS == 16`. Current scalar / SIMD kernels `debug_assert!` out +//! `BITS == 16` precisely to surface this. +//! - **4:2:2 and 4:4:4** (`Yuv422p`, `Yuv444p`, `Nv16`, `Nv24`, +//! `Nv42`) — share the Q15 math but need their own row walkers +//! for the different chroma subsampling / stride. +//! - **Packed RGB sources** (`Rgb24`, `Bgr24`, `Rgba`, `Bgra`, +//! `Rgba1010102`, etc.). +//! +//! See [`yuv`] for the per-format module-level breakdown and +//! [`frame`] for the validated frame types plus the `BITS` const +//! generic on the high-bit-depth families (`Yuv420pFrame16` +//! and `PnFrame`). +//! +//! [`Yuv420p`]: crate::yuv::Yuv420p +//! [`Nv12`]: crate::yuv::Nv12 +//! [`Nv21`]: crate::yuv::Nv21 +//! [`Yuv420p10`]: crate::yuv::Yuv420p10 +//! [`Yuv420p12`]: crate::yuv::Yuv420p12 +//! [`Yuv420p14`]: crate::yuv::Yuv420p14 +//! [`P010`]: crate::yuv::P010 +//! [`P012`]: crate::yuv::P012 #![cfg_attr(not(feature = "std"), no_std)] #![cfg_attr(docsrs, feature(doc_cfg))] @@ -167,8 +205,9 @@ pub trait PixelSink { } /// Consume one input unit. Called by the kernel once per unit (one - /// row, for the row-granular kernels v0.1 ships). Input borrows may - /// be invalidated after the call returns — implementations must not + /// row, for the row-granular kernels currently shipped). Input + /// borrows may be invalidated after the call returns — + /// implementations must not /// retain them. 
/// /// Returns `Err` to short-circuit the walker: on the first `Err`, diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index f98d9cd..878d5e3 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -39,8 +39,8 @@ use core::arch::aarch64::{ vget_high_s16, vget_high_u8, vget_high_u16, vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, vld1q_u8, vld1q_u16, vld2_u8, vld2q_u16, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, vmovl_s16, vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, - vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, - vshrq_n_u16, vst1q_u8, vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, + vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshlq_u16, + vshrq_n_s32, vst1q_u8, vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, }; use crate::{ColorMatrix, row::scalar}; @@ -190,7 +190,8 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } -/// NEON YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// NEON high‑bit‑depth YUV 4:2:0 (`BITS` ∈ {10, 12, 14}) → packed +/// **8‑bit** RGB. /// /// Block size is 16 Y pixels / 8 chroma pairs per iteration. The /// pipeline mirrors [`yuv_420_to_rgb_row`] byte‑for‑byte; the only @@ -199,16 +200,20 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// (16 lanes of `u8`), so each Y iteration needs two Y loads to /// cover 16 pixels — there's no widening step because the samples /// already live in 16‑bit lanes. -/// - Chroma bias is **512** (10‑bit center) rather than 128. +/// - Chroma bias is `128 << (BITS - 8)` (512 for 10‑bit, 2048 for +/// 12‑bit, 8192 for 14‑bit) rather than 128. /// - Range‑scaling params come from [`scalar::range_params_n`] with -/// `BITS = 10, OUT_BITS = 8`, so `y_scale` / `c_scale` are ~¼ the -/// 8‑bit values (mapping 10‑bit input to 8‑bit output). 
+///   the matching `BITS` const, so `y_scale` / `c_scale` map the
+///   source depth to 8‑bit output in a single Q15 shift.
+/// - Each load is AND‑masked to the low `BITS` bits so out‑of‑range
+///   samples (e.g. high‑bit‑packed data mistakenly handed to the
+///   low‑packed kernel) produce deterministic, backend‑consistent
+///   output.
 ///
 /// # Numerical contract
 ///
-/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`] — every
-/// Q15 multiply / shift mirrors the scalar path exactly, with the
-/// same `(prod + (1 << 14)) >> 15` rounding.
+/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<BITS>`] across
+/// all supported bit depths.
 ///
 /// # Safety
 ///
@@ -216,9 +221,11 @@
 /// 2. `width & 1 == 0`.
 /// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
 ///    `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
+/// 4. `BITS` must be one of `{10, 12, 14}` — the Q15 pipeline
+///    overflows i32 at 16 bits; see [`scalar::range_params_n`].
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn yuv420p10_to_rgb_row(
+pub(crate) unsafe fn yuv_420p_n_to_rgb_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
@@ -234,8 +241,8 @@
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, 8>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
 
     // SAFETY: NEON availability is the caller's obligation; the
@@ -248,7 +255,7 @@
         let y_scale_v = vdupq_n_s32(y_scale);
         let c_scale_v = vdupq_n_s32(c_scale);
         let bias_v = vdupq_n_s16(bias as i16);
-        let mask_v = vdupq_n_u16(scalar::bits_mask::<10>());
+        let mask_v = vdupq_n_u16(scalar::bits_mask::<BITS>());
         let cru = vdupq_n_s32(coeffs.r_u());
         let crv = vdupq_n_s32(coeffs.r_v());
         let cgu = vdupq_n_s32(coeffs.g_u());
@@ -259,11 +266,10 @@
         let mut x = 0usize;
         while x + 16 <= width {
             // Two Y loads cover 16 lanes; one U load + one V load cover 8
-            // chroma each. Each load is AND‑masked to the low 10 bits so
-            // out‑of‑range samples (e.g. `p010`‑style packing with the
-            // 10 active bits in the high 10 of each u16) can never push
-            // an intermediate past i16 range. For valid input the AND is
-            // a no‑op (samples already in [0, 1023]).
+            // chroma each. Each load is AND‑masked to the low BITS bits so
+            // out‑of‑range samples (e.g. high‑bit‑packed data handed to
+            // the low‑packed kernel) can never push an intermediate past
+            // i16 range. For valid input the AND is a no‑op.
             let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v);
             let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v);
             let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v);
@@ -325,7 +331,7 @@
 
     // Scalar tail — remaining < 16 pixels (always even per 4:2:0).
     if x < width {
-        scalar::yuv_420p_n_to_rgb_row::<10>(
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(
             &y[x..width],
             &u_half[x / 2..width / 2],
             &v_half[x / 2..width / 2],
@@ -338,24 +344,25 @@
     }
 }
 
-/// NEON YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB (native depth).
+/// NEON high‑bit‑depth YUV 4:2:0 (`BITS` ∈ {10, 12, 14}) → packed
+/// **native‑depth `u16`** RGB.
 ///
 /// Block size is 16 Y pixels / 8 chroma pairs per iteration. Shares
-/// all pre‑write math with [`yuv420p10_to_rgb_row`]; the only
+/// all pre‑write math with [`yuv_420p_n_to_rgb_row`]; the only
 /// difference is the final clamp + write:
-/// - Y‑path scale is calibrated for `OUT_BITS = 10` rather than 8,
-///   so `y_scaled` lives in `[0, 1023]` before the chroma add.
-/// - The `y_scaled + chroma` sum is clamped to `[0, 1023]` with
-///   `vmaxq_s16(vminq_s16(_, 1023), 0)` — a simple saturate‑narrow
-///   doesn't suffice because the sum can overshoot 1023 (up to ~2046
-///   without saturating at i16 bounds).
+/// - Y‑path scale is calibrated for `OUT_BITS = BITS` rather than 8,
+///   so `y_scaled` lives in `[0, (1 << BITS) - 1]`.
+/// - The `y_scaled + chroma` sum is clamped to `[0, (1 << BITS) - 1]`
+///   with `vmaxq_s16(vminq_s16(_, max), 0)` — a simple saturate‑
+///   narrow doesn't suffice because the sum can overshoot the
+///   `BITS`-bit max without saturating at i16 bounds.
 /// - Writes use two `vst3q_u16` calls per iteration — each handles 8
 ///   pixels × 3 channels = 24 `u16` elements, so two cover 16 pixels.
 ///
 /// # Numerical contract
 ///
-/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`] — every
-/// Q15 multiply / shift / clamp mirrors the scalar reference.
+/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<BITS>`] across
+/// supported `BITS` values.
 ///
 /// # Safety
 ///
@@ -363,9 +370,10 @@
 /// 2. `width & 1 == 0`.
 /// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
 ///    `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
+/// 4. `BITS` must be one of `{10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn yuv420p10_to_rgb_u16_row(
+pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
@@ -381,10 +389,10 @@
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, BITS>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
-    const OUT_MAX_10: i16 = 1023;
+    let out_max: i16 = ((1i32 << BITS) - 1) as i16;
 
     // SAFETY: NEON availability is the caller's obligation; the
     // dispatcher in `crate::row` verifies it. Pointer adds are bounded
@@ -396,8 +404,8 @@
         let y_scale_v = vdupq_n_s32(y_scale);
         let c_scale_v = vdupq_n_s32(c_scale);
         let bias_v = vdupq_n_s16(bias as i16);
-        let mask_v = vdupq_n_u16(scalar::bits_mask::<10>());
-        let max_v = vdupq_n_s16(OUT_MAX_10);
+        let mask_v = vdupq_n_u16(scalar::bits_mask::<BITS>());
+        let max_v = vdupq_n_s16(out_max);
         let zero_v = vdupq_n_s16(0);
         let cru = vdupq_n_s32(coeffs.r_u());
         let crv = vdupq_n_s32(coeffs.r_v());
@@ -408,9 +416,9 @@
         let mut x = 0usize;
         while x + 16 <= width {
-            // AND‑mask each load to the low 10 bits so intermediates stay
-            // within the i16 range the Q15 narrow steps expect — see
-            // matching comment in [`yuv420p10_to_rgb_row`].
+            // AND‑mask each load to the low BITS bits so intermediates
+            // stay within the i16 range the Q15 narrow steps expect — see
+            // matching comment in [`yuv_420p_n_to_rgb_row`].
             let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v);
             let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v);
             let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v);
@@ -447,9 +455,10 @@
             let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v);
 
             // Native‑depth output: add Y + chroma in i16, then clamp to
-            // [0, 1023] explicitly. `vqaddq_s16` saturates at i16 bounds
-            // (irrelevant here since |sum| < 2047 always), so the subsequent
-            // max/min clamps to the 10‑bit range.
+            // [0, (1 << BITS) - 1] explicitly. `vqaddq_s16` saturates at
+            // i16 bounds (irrelevant here: |sum| stays well inside i16
+            // for BITS ≤ 14), so the subsequent max/min clamps to the
+            // native bit depth.
             let r_lo = clamp_u10(vqaddq_s16(y_scaled_lo, r_dup_lo), zero_v, max_v);
             let r_hi = clamp_u10(vqaddq_s16(y_scaled_hi, r_dup_hi), zero_v, max_v);
             let g_lo = clamp_u10(vqaddq_s16(y_scaled_lo, g_dup_lo), zero_v, max_v);
@@ -467,7 +476,7 @@
     }
 
     if x < width {
-        scalar::yuv_420p_n_to_rgb_u16_row::<10>(
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
             &y[x..width],
             &u_half[x / 2..width / 2],
             &v_half[x / 2..width / 2],
@@ -481,33 +490,35 @@
 }
 
 /// Clamps an i16x8 vector to `[0, max]` and reinterprets to u16x8.
-/// Used by the 10‑bit u16 output path to avoid `vqmovun_s16`'s u8
-/// saturation.
+/// Used by the native‑depth u16 output paths to avoid `vqmovun_s16`'s
+/// u8 saturation.
 #[inline(always)]
 fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_t {
     unsafe { vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(v, zero_v), max_v)) }
 }
 
-/// NEON P010 → packed **8‑bit** RGB.
+/// NEON high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}: P010, P012)
+/// → packed **8‑bit** RGB.
 ///
 /// Block size 16 Y pixels / 8 chroma pairs per iteration. Differences
-/// from [`yuv420p10_to_rgb_row`]:
+/// from [`yuv_420p_n_to_rgb_row`]:
 /// - UV is semi‑planar interleaved (`U0, V0, U1, V1, …`), split in
 ///   one shot via `vld2q_u16` (returns separate U and V vectors).
-/// - Each `u16` load is **shifted right by 6** (`vshrq_n_u16::<6>`)
-///   instead of AND‑masked — P010 packs its 10 active bits in the
-///   HIGH 10 of each `u16`, so `>> 6` extracts the value and
-///   simultaneously clears the low 6 bits (which the format mandates
-///   are zero anyway; the shift makes mispacked input deterministic).
-/// - Chroma bias is 512 (10‑bit center) after the shift.
+/// - Each `u16` load is **right‑shifted by `16 - BITS`** — 6 for
+///   P010, 4 for P012 — extracting the `BITS` active bits from the
+///   high bits of each `u16` and clearing the low bits. The shift
+///   runs via `vshlq_u16` with a negative loop‑invariant count so a
+///   single kernel serves all supported bit depths.
 ///
 /// After the shift, the rest of the pipeline is identical to the
-/// `yuv420p10` path — same `chroma_i16x8` / `scale_y` / `chroma_dup`
-/// / `vst3q_u8` write, with `range_params_n::<10, 8>` scaling.
+/// low‑bit‑packed planar path — same `chroma_i16x8` / `scale_y` /
+/// `chroma_dup` / `vst3q_u8` write, with `range_params_n::<BITS, 8>`
+/// scaling.
 ///
 /// # Numerical contract
 ///
-/// Byte‑identical to [`scalar::p010_to_rgb_row`].
+/// Byte‑identical to [`scalar::p_n_to_rgb_row::<BITS>`] across all
+/// supported `BITS` values.
 ///
 /// # Safety
 ///
@@ -515,9 +526,10 @@
 /// 2. `width & 1 == 0`.
 /// 3. `y.len() >= width`, `uv_half.len() >= width`,
 ///    `rgb_out.len() >= 3 * width`.
+/// 4. `BITS` must be one of `{10, 12}`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn p010_to_rgb_row(
+pub(crate) unsafe fn p_n_to_rgb_row<const BITS: usize>(
     y: &[u16],
     uv_half: &[u16],
     rgb_out: &mut [u8],
@@ -531,8 +543,8 @@
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, 8>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
 
     // SAFETY: NEON availability is the caller's obligation.
@@ -542,6 +554,9 @@
         let y_scale_v = vdupq_n_s32(y_scale);
         let c_scale_v = vdupq_n_s32(c_scale);
         let bias_v = vdupq_n_s16(bias as i16);
+        // `vshlq_u16` performs right shift when the count is negative.
+        // Count = -(16 - BITS) extracts the `BITS` active high bits.
+        let shr_count = vdupq_n_s16(-((16 - BITS) as i16));
         let cru = vdupq_n_s32(coeffs.r_u());
         let crv = vdupq_n_s32(coeffs.r_v());
         let cgu = vdupq_n_s32(coeffs.g_u());
@@ -551,17 +566,16 @@
         let mut x = 0usize;
         while x + 16 <= width {
-            // 16 Y pixels in two u16x8 loads, shifted right by 6 to extract
-            // the 10‑bit values from P010's high‑bit packing.
-            let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x)));
-            let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8)));
+            // 16 Y pixels in two u16x8 loads, right-shifted by 16-BITS to
+            // extract the active bits from the high-bit packing.
+            let y_vec_lo = vshlq_u16(vld1q_u16(y.as_ptr().add(x)), shr_count);
+            let y_vec_hi = vshlq_u16(vld1q_u16(y.as_ptr().add(x + 8)), shr_count);
 
             // Semi‑planar UV: `vld2q_u16` loads 16 interleaved `u16` elements
-            // and returns (evens, odds) = (U, V) in one shot. Each gets the
-            // same `>> 6` shift as Y.
+            // and returns (evens, odds) = (U, V) in one shot.
             let uv_pair = vld2q_u16(uv_half.as_ptr().add(x));
-            let u_vec = vshrq_n_u16::<6>(uv_pair.0);
-            let v_vec = vshrq_n_u16::<6>(uv_pair.1);
+            let u_vec = vshlq_u16(uv_pair.0, shr_count);
+            let v_vec = vshlq_u16(uv_pair.1, shr_count);
 
             let y_lo = vreinterpretq_s16_u16(y_vec_lo);
             let y_hi = vreinterpretq_s16_u16(y_vec_hi);
@@ -613,7 +627,7 @@
     }
 
     if x < width {
-        scalar::p010_to_rgb_row(
+        scalar::p_n_to_rgb_row::<BITS>(
            &y[x..width],
            &uv_half[x..width],
            &mut rgb_out[x * 3..width * 3],
@@ -625,20 +639,24 @@
     }
 }
 
-/// NEON P010 → packed **10‑bit `u16`** RGB (native‑depth, low‑bit‑
-/// packed output — `yuv420p10le` convention, not P010).
+/// NEON high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed
+/// **native‑depth `u16`** RGB (low‑bit‑packed output,
+/// `yuv420p10le` / `yuv420p12le` convention — not P010/P012).
 ///
-/// Same structure as [`p010_to_rgb_row`] up to the chroma compute;
-/// the only differences are:
-/// - `range_params_n::<10, 10>` → larger scales targeting the 10‑bit
-///   output range.
-/// - Clamp is explicit min/max to `[0, 1023]` via
-///   [`clamp_u10`](crate::row::arch::neon::clamp_u10).
+/// Same structure as [`super::neon::p_n_to_rgb_row`] up to the
+/// chroma compute; the only differences are:
+/// - `range_params_n::<BITS, BITS>` → larger scales targeting the
+///   native‑depth output range.
+/// - Clamp is explicit min/max to `[0, (1 << BITS) - 1]` via
+///   [`clamp_u10`](crate::row::arch::neon::clamp_u10) — the helper
+///   name is historical; the actual max is derived from `BITS` at
+///   the call site (1023 for P010, 4095 for P012).
 /// - Writes use two `vst3q_u16` calls per 16‑pixel block.
 ///
 /// # Numerical contract
 ///
-/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`].
+/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<BITS>`] for the
+/// monomorphized `BITS`.
 ///
 /// # Safety
 ///
@@ -648,7 +666,7 @@
 ///    `rgb_out.len() >= 3 * width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn p010_to_rgb_u16_row(
+pub(crate) unsafe fn p_n_to_rgb_u16_row<const BITS: usize>(
     y: &[u16],
     uv_half: &[u16],
     rgb_out: &mut [u16],
@@ -662,10 +680,10 @@
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, BITS>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
-    const OUT_MAX_10: i16 = 1023;
+    let out_max: i16 = ((1i32 << BITS) - 1) as i16;
 
     // SAFETY: NEON availability is the caller's obligation.
unsafe { @@ -674,7 +692,8 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let y_scale_v = vdupq_n_s32(y_scale); let c_scale_v = vdupq_n_s32(c_scale); let bias_v = vdupq_n_s16(bias as i16); - let max_v = vdupq_n_s16(OUT_MAX_10); + let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); + let max_v = vdupq_n_s16(out_max); let zero_v = vdupq_n_s16(0); let cru = vdupq_n_s32(coeffs.r_u()); let crv = vdupq_n_s32(coeffs.r_v()); @@ -685,11 +704,11 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); - let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + let y_vec_lo = vshlq_u16(vld1q_u16(y.as_ptr().add(x)), shr_count); + let y_vec_hi = vshlq_u16(vld1q_u16(y.as_ptr().add(x + 8)), shr_count); let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); - let u_vec = vshrq_n_u16::<6>(uv_pair.0); - let v_vec = vshrq_n_u16::<6>(uv_pair.1); + let u_vec = vshlq_u16(uv_pair.0, shr_count); + let v_vec = vshlq_u16(uv_pair.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -737,7 +756,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1705,7 +1724,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { @@ -1730,7 +1749,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { 
@@ -1851,7 +1870,7 @@ mod tests { full_range, ); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -1870,7 +1889,7 @@ mod tests { full_range, ); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); } assert_eq!( rgb16_scalar, rgb16_neon, @@ -1913,9 +1932,9 @@ mod tests { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { let diff = rgb_scalar @@ -1938,9 +1957,9 @@ mod tests { let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { let diff = rgb_scalar @@ -2036,9 +2055,9 @@ mod tests { for full_range in [true, false] { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_neon, 
width, matrix, full_range);
                 }
                 assert_eq!(
                     rgb_scalar, rgb_neon,
@@ -2047,9 +2066,9 @@
                 let mut rgb16_scalar = std::vec![0u16; width * 3];
                 let mut rgb16_neon = std::vec![0u16; width * 3];
-                scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb16_scalar, width, matrix, full_range);
+                scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb16_scalar, width, matrix, full_range);
                 unsafe {
-                    p010_to_rgb_u16_row(&y, &uv, &mut rgb16_neon, width, matrix, full_range);
+                    p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb16_neon, width, matrix, full_range);
                 }
                 assert_eq!(
                     rgb16_scalar, rgb16_neon,
@@ -2059,4 +2078,156 @@
             }
         }
     }
+
+    // ---- Generic BITS equivalence (12/14-bit coverage) ------------------
+
+    fn planar_n_plane<const BITS: usize>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16)
+            .collect()
+    }
+
+    fn p_n_packed_plane<const BITS: usize>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        let mask = (1u32 << BITS) - 1;
+        let shift = 16 - BITS;
+        (0..n)
+            .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift)
+            .collect()
+    }
+
+    fn check_planar_u8_neon_equivalence_n<const BITS: usize>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_neon = std::vec![0u8; width * 3];
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_420p_n_to_rgb_row::<BITS>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_neon, "NEON planar {BITS}-bit → u8 diverges");
+    }
+
+    fn check_planar_u16_neon_equivalence_n<const BITS: usize>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        let y = planar_n_plane::<BITS>(width, 37);
+        let u = planar_n_plane::<BITS>(width / 2, 53);
+        let v = planar_n_plane::<BITS>(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_neon = std::vec![0u16; width * 3];
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &mut rgb_scalar,
+            width,
+            matrix,
+            full_range,
+        );
+        unsafe {
+            yuv_420p_n_to_rgb_u16_row::<BITS>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range);
+        }
+        assert_eq!(
+            rgb_scalar, rgb_neon,
+            "NEON planar {BITS}-bit → u16 diverges"
+        );
+    }
+
+    fn check_pn_u8_neon_equivalence_n<const BITS: usize>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_neon = std::vec![0u8; width * 3];
+        scalar::p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_row::<BITS>(&y, &uv, &mut rgb_neon, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_neon, "NEON Pn {BITS}-bit → u8 diverges");
+    }
+
+    fn check_pn_u16_neon_equivalence_n<const BITS: usize>(
+        width: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) {
+        let y = p_n_packed_plane::<BITS>(width, 37);
+        let u = p_n_packed_plane::<BITS>(width / 2, 53);
+        let v = p_n_packed_plane::<BITS>(width / 2, 71);
+        let uv = p010_uv_interleave(&u, &v);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_neon = std::vec![0u16; width * 3];
+        scalar::p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            p_n_to_rgb_u16_row::<BITS>(&y, &uv, &mut rgb_neon, width, matrix, full_range);
+        }
+        assert_eq!(rgb_scalar, rgb_neon, "NEON Pn {BITS}-bit → u16 diverges");
+    }
+
+    #[test]
+    fn neon_p12_matches_scalar_all_matrices() {
+        for m in [
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::Smpte240m,
+            ColorMatrix::Fcc,
+            ColorMatrix::YCgCo,
+        ] {
+            for full in [true, false] {
+                check_planar_u8_neon_equivalence_n::<12>(16, m, full);
+                check_planar_u16_neon_equivalence_n::<12>(16, m, full);
+                check_pn_u8_neon_equivalence_n::<12>(16, m, full);
+
check_pn_u16_neon_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn neon_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_neon_equivalence_n::<14>(16, m, full); + check_planar_u16_neon_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn neon_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_neon_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_neon_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_neon_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_neon_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn neon_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_neon_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_neon_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 21efa7c..2a54fbd 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -214,7 +214,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn yuv420p10_to_rgb_row(
+pub(crate) unsafe fn yuv_420p_n_to_rgb_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
@@ -230,8 +230,8 @@
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, 8>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
 
     // SAFETY: simd128 compile‑time availability is the caller's
@@ -242,7 +242,7 @@
         let y_scale_v = i32x4_splat(y_scale);
         let c_scale_v = i32x4_splat(c_scale);
         let bias_v = i16x8_splat(bias as i16);
-        let mask_v = u16x8_splat(scalar::bits_mask::<10>());
+        let mask_v = u16x8_splat(scalar::bits_mask::<BITS>());
         let cru = i32x4_splat(coeffs.r_u());
         let crv = i32x4_splat(coeffs.r_v());
         let cgu = i32x4_splat(coeffs.g_u());
@@ -303,7 +303,7 @@
     }
 
     if x < width {
-        scalar::yuv_420p_n_to_rgb_row::<10>(
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(
             &y[x..width],
             &u_half[x / 2..width / 2],
             &v_half[x / 2..width / 2],
@@ -334,7 +334,7 @@
 ///    `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn yuv420p10_to_rgb_u16_row(
+pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
@@ -350,10 +350,10 @@
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, BITS>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
-    const OUT_MAX_10: i16 = 1023;
+    let out_max: i16 = ((1i32 << BITS) - 1) as i16;
 
     // SAFETY: simd128 compile‑time availability is the caller's
     // obligation.
@@ -363,8 +363,8 @@
         let y_scale_v = i32x4_splat(y_scale);
         let c_scale_v = i32x4_splat(c_scale);
         let bias_v = i16x8_splat(bias as i16);
-        let mask_v = u16x8_splat(scalar::bits_mask::<10>());
-        let max_v = i16x8_splat(OUT_MAX_10);
+        let mask_v = u16x8_splat(scalar::bits_mask::<BITS>());
+        let max_v = i16x8_splat(out_max);
         let zero_v = i16x8_splat(0);
         let cru = i32x4_splat(coeffs.r_u());
         let crv = i32x4_splat(coeffs.r_v());
@@ -424,7 +424,7 @@
     if x < width {
-        scalar::yuv_420p_n_to_rgb_u16_row::<10>(
+        scalar::yuv_420p_n_to_rgb_u16_row::<BITS>(
             &y[x..width],
             &u_half[x / 2..width / 2],
             &v_half[x / 2..width / 2],
@@ -500,18 +500,22 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) {
     }
 }
 
-/// WASM simd128 P010 → packed **8‑bit** RGB.
+/// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) →
+/// packed **8‑bit** RGB.
 ///
 /// Block size 16 Y pixels / 8 chroma pairs per iteration. Mirrors
-/// [`yuv420p10_to_rgb_row`] with two structural differences:
-/// - Samples are shifted right by 6 (`u16x8_shr(_, 6)`) instead of
-///   AND‑masked.
+/// [`super::wasm_simd128::yuv_420p_n_to_rgb_row`] with two structural
+/// differences:
+/// - Samples are shifted right by `16 - BITS` (`u16x8_shr`, with
+///   the shift amount computed from `BITS` once per call) instead
+///   of AND‑masked.
 /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_wasm`]
 ///   (two `u8x16_swizzle` + two `i8x16_shuffle` combines).
 ///
 /// # Numerical contract
 ///
-/// Byte‑identical to [`scalar::p010_to_rgb_row`].
+/// Byte‑identical to [`scalar::p_n_to_rgb_row::<BITS>`] for the
+/// monomorphized `BITS`.
 ///
 /// # Safety
 ///
@@ -521,7 +525,7 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) {
 ///    `rgb_out.len() >= 3 * width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn p010_to_rgb_row(
+pub(crate) unsafe fn p_n_to_rgb_row<const BITS: usize>(
     y: &[u16],
     uv_half: &[u16],
     rgb_out: &mut [u8],
@@ -535,8 +539,8 @@
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, 8>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
 
     // SAFETY: simd128 compile‑time availability is the caller's
@@ -554,13 +558,16 @@ pub(crate) unsafe fn p010_to_rgb_row(
         let cbu = i32x4_splat(coeffs.b_u());
         let cbv = i32x4_splat(coeffs.b_v());
 
+        // High-bit-packed samples: shift right by `16 - BITS`.
+        let shr = (16 - BITS) as u32;
+
         let mut x = 0usize;
         while x + 16 <= width {
-            let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6);
-            let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6);
+            let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), shr);
+            let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), shr);
             let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x));
-            let u_vec = u16x8_shr(u_vec, 6);
-            let v_vec = u16x8_shr(v_vec, 6);
+            let u_vec = u16x8_shr(u_vec, shr);
+            let v_vec = u16x8_shr(v_vec, shr);
 
             let u_i16 = i16x8_sub(u_vec, bias_v);
             let v_i16 = i16x8_sub(v_vec, bias_v);
@@ -606,7 +613,7 @@
     }
 
     if x < width {
-        scalar::p010_to_rgb_row(
+        scalar::p_n_to_rgb_row::<BITS>(
            &y[x..width],
            &uv_half[x..width],
            &mut rgb_out[x * 3..width * 3],
@@ -618,12 +625,14 @@
     }
 }
 
-/// WASM simd128 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed
-/// `yuv420p10le` convention).
+/// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) →
+/// packed **native‑depth `u16`** RGB (low‑bit‑packed output,
+/// `yuv420pNle` convention).
 ///
 /// # Numerical contract
 ///
-/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`].
+/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::<BITS>`] for the
+/// monomorphized `BITS`.
 ///
 /// # Safety
 ///
@@ -633,7 +642,7 @@
 ///    `rgb_out.len() >= 3 * width`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -647,10 +656,10 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: simd128 compile‑time availability is the caller's // obligation. @@ -660,7 +669,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let y_scale_v = i32x4_splat(y_scale); let c_scale_v = i32x4_splat(c_scale); let bias_v = i16x8_splat(bias as i16); - let max_v = i16x8_splat(OUT_MAX_10); + let max_v = i16x8_splat(out_max); let zero_v = i16x8_splat(0); let cru = i32x4_splat(coeffs.r_u()); let crv = i32x4_splat(coeffs.r_v()); @@ -669,13 +678,16 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr = (16 - BITS) as u32; + let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6); - let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6); + let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), shr); + let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), shr); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x)); - let u_vec = u16x8_shr(u_vec, 6); - let v_vec = u16x8_shr(v_vec, 6); + let u_vec = u16x8_shr(u_vec, shr); + let v_vec = u16x8_shr(v_vec, shr); let u_i16 = i16x8_sub(u_vec, bias_v); let v_i16 = i16x8_sub(v_vec, bias_v); @@ -719,7 +731,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1611,7 +1623,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1636,7 +1648,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1724,9 +1736,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, 
width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u8 diverges"); } @@ -1738,9 +1750,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u16 diverges"); } @@ -1790,4 +1802,159 @@ mod tests { check_p010_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- Generic BITS equivalence (12/14-bit coverage) ------------------ + + fn planar_n_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + (0..n) + .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16) + .collect() + } + + fn p_n_packed_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let shift = 16 - BITS; + (0..n) + .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift) + .collect() + } + + fn check_planar_u8_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "simd128 planar {BITS}-bit → u8 diverges" + ); + } + + fn check_planar_u16_simd128_equivalence_n( + width: usize, + 
matrix: ColorMatrix, + full_range: bool, + ) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "simd128 planar {BITS}-bit → u16 diverges" + ); + } + + fn check_pn_u8_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "simd128 Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_simd128_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "simd128 Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn simd128_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + 
ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_simd128_equivalence_n::<12>(16, m, full); + check_planar_u16_simd128_equivalence_n::<12>(16, m, full); + check_pn_u8_simd128_equivalence_n::<12>(16, m, full); + check_pn_u16_simd128_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn simd128_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_simd128_equivalence_n::<14>(16, m, full); + check_planar_u16_simd128_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn simd128_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_simd128_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_simd128_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_simd128_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_simd128_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn simd128_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_simd128_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_simd128_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index e5b6db7..10258c2 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -39,12 +39,13 @@ //! element order. Every fixup is called out inline. 
 use core::arch::x86_64::{
-    __m256i, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, _mm256_and_si256,
-    _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256,
-    _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, _mm256_mullo_epi32, _mm256_packs_epi32,
-    _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set1_epi16,
-    _mm256_set1_epi32, _mm256_setr_epi8, _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_srli_epi16,
-    _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpacklo_epi16,
+    __m256i, _mm_cvtsi32_si128, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16,
+    _mm256_and_si256, _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16,
+    _mm256_extracti128_si256, _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16,
+    _mm256_mullo_epi32, _mm256_packs_epi32, _mm256_packus_epi16, _mm256_permute2x128_si256,
+    _mm256_permute4x64_epi64, _mm256_set1_epi16, _mm256_set1_epi32, _mm256_setr_epi8,
+    _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_srl_epi16, _mm256_sub_epi16,
+    _mm256_unpackhi_epi16, _mm256_unpacklo_epi16,
 };
 
 use crate::{
@@ -234,7 +235,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row(
 /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn yuv420p10_to_rgb_row(
+pub(crate) unsafe fn yuv_420p_n_to_rgb_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
@@ -250,8 +251,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row(
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, 8>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
 
     // SAFETY: AVX2 availability is the caller's obligation.
@@ -261,7 +262,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row(
         let y_scale_v = _mm256_set1_epi32(y_scale);
         let c_scale_v = _mm256_set1_epi32(c_scale);
         let bias_v = _mm256_set1_epi16(bias as i16);
-        let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16);
+        let mask_v = _mm256_set1_epi16(scalar::bits_mask::<BITS>() as i16);
         let cru = _mm256_set1_epi32(coeffs.r_u());
         let crv = _mm256_set1_epi32(coeffs.r_v());
         let cgu = _mm256_set1_epi32(coeffs.g_u());
@@ -338,7 +339,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row(
     }
 
     if x < width {
-        scalar::yuv_420p_n_to_rgb_row::<10>(
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(
             &y[x..width],
             &u_half[x / 2..width / 2],
             &v_half[x / 2..width / 2],
@@ -375,7 +376,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row(
 /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn yuv420p10_to_rgb_u16_row(
+pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
@@ -391,10 +392,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row(
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, BITS>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
-    const OUT_MAX_10: i16 = 1023;
+    let out_max: i16 = ((1i32 << BITS) - 1) as i16;
 
     // SAFETY: AVX2 availability is the caller's obligation.
unsafe { @@ -403,8 +404,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); - let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); - let max_v = _mm256_set1_epi16(OUT_MAX_10); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::() as i16); + let max_v = _mm256_set1_epi16(out_max); let zero_v = _mm256_set1_epi16(0); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); @@ -505,7 +506,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -526,19 +527,23 @@ fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { unsafe { _mm256_min_epi16(_mm256_max_epi16(v, zero_v), max_v) } } -/// AVX2 P010 → packed **8‑bit** RGB. +/// AVX2 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 32 Y pixels / 16 chroma pairs per iteration. Mirrors -/// [`yuv420p10_to_rgb_row`] with two structural differences: -/// - Samples are shifted right by 6 (`_mm256_srli_epi16::<6>`) -/// instead of AND‑masked. +/// [`super::x86_avx2::yuv_420p_n_to_rgb_row`] with two structural +/// differences: +/// - Samples are shifted right by `16 - BITS` (`_mm256_srl_epi16`, +/// with a shift count computed from `BITS` once per call) instead +/// of AND‑masked. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx2`] /// (two `_mm256_shuffle_epi8` + two `_mm256_permute4x64_epi64` + /// two `_mm256_permute2x128_si256` per 32 chroma elements). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. 
/// /// # Safety /// @@ -548,7 +553,7 @@ fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -562,8 +567,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX2 availability is the caller's obligation. @@ -573,6 +578,8 @@ pub(crate) unsafe fn p010_to_rgb_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); + // High-bit-packed samples: shift right by `16 - BITS`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); let cgu = _mm256_set1_epi32(coeffs.g_u()); @@ -582,14 +589,15 @@ pub(crate) unsafe fn p010_to_rgb_row( let mut x = 0usize; while x + 32 <= width { - // 32 Y = two u16×16 loads, shifted right by 6. - let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast())); + // 32 Y = two u16×16 loads, shifted right by `16 - BITS`. + let y_low_i16 = _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), shr_count); // 32 UV (16 pairs) — deinterleave + shift. 
let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); - let u_vec = _mm256_srli_epi16::<6>(u_vec); - let v_vec = _mm256_srli_epi16::<6>(v_vec); + let u_vec = _mm256_srl_epi16(u_vec, shr_count); + let v_vec = _mm256_srl_epi16(v_vec, shr_count); let u_i16 = _mm256_sub_epi16(u_vec, bias_v); let v_i16 = _mm256_sub_epi16(v_vec, bias_v); @@ -644,7 +652,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -656,12 +664,14 @@ pub(crate) unsafe fn p010_to_rgb_row( } } -/// AVX2 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed -/// `yuv420p10le` convention). +/// AVX2 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -671,7 +681,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -685,10 +695,10 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX2 availability is the caller's obligation. 
unsafe { @@ -697,8 +707,10 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let y_scale_v = _mm256_set1_epi32(y_scale); let c_scale_v = _mm256_set1_epi32(c_scale); let bias_v = _mm256_set1_epi16(bias as i16); - let max_v = _mm256_set1_epi16(OUT_MAX_10); + let max_v = _mm256_set1_epi16(out_max); let zero_v = _mm256_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm256_set1_epi32(coeffs.r_u()); let crv = _mm256_set1_epi32(coeffs.r_v()); let cgu = _mm256_set1_epi32(coeffs.g_u()); @@ -708,11 +720,12 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { - let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast())); + let y_low_i16 = _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); - let u_vec = _mm256_srli_epi16::<6>(u_vec); - let v_vec = _mm256_srli_epi16::<6>(v_vec); + let u_vec = _mm256_srl_epi16(u_vec, shr_count); + let v_vec = _mm256_srl_epi16(v_vec, shr_count); let u_i16 = _mm256_sub_epi16(u_vec, bias_v); let v_i16 = _mm256_sub_epi16(v_vec, bias_v); @@ -787,7 +800,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1610,7 +1623,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1638,7 +1651,7 @@ mod tests { 
scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1729,9 +1742,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u8 diverges"); } @@ -1746,9 +1759,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u16 diverges"); } @@ -1798,4 +1811,168 @@ mod tests { check_p010_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- Generic BITS equivalence (12/14-bit coverage) ------------------ + + fn planar_n_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + (0..n) + .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16) + .collect() + } + + fn p_n_packed_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let shift = 16 - BITS; + (0..n) + .map(|i| (((i 
* seed + seed * 3) as u32 & mask) as u16) << shift) + .collect() + } + + fn check_planar_u8_avx2_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX2 planar {BITS}-bit → u8 diverges"); + } + + fn check_planar_u16_avx2_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "AVX2 planar {BITS}-bit → u16 diverges" + ); + } + + fn check_pn_u8_avx2_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut 
rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX2 Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_avx2_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX2 Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn avx2_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx2_equivalence_n::<12>(32, m, full); + check_planar_u16_avx2_equivalence_n::<12>(32, m, full); + check_pn_u8_avx2_equivalence_n::<12>(32, m, full); + check_pn_u16_avx2_equivalence_n::<12>(32, m, full); + } + } + } + + #[test] + fn avx2_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx2_equivalence_n::<14>(32, m, full); + check_planar_u16_avx2_equivalence_n::<14>(32, m, full); + } + } + } + + #[test] + fn avx2_p12_matches_scalar_tail_widths() { + for w in [34usize, 62, 66, 1922] { + check_planar_u8_avx2_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx2_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_avx2_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + 
check_pn_u16_avx2_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn avx2_p14_matches_scalar_tail_widths() { + for w in [34usize, 62, 66, 1922] { + check_planar_u8_avx2_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx2_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 19caeaa..3925276 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -53,13 +53,13 @@ //! two 32‑Y‑block‑aligned vectors from unpacklo + unpackhi. use core::arch::x86_64::{ - __m128i, __m512i, _mm_setr_epi8, _mm256_loadu_si256, _mm512_add_epi32, _mm512_adds_epi16, - _mm512_and_si512, _mm512_broadcast_i32x4, _mm512_castsi512_si128, _mm512_castsi512_si256, - _mm512_cvtepi16_epi32, _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, + __m128i, __m512i, _mm_cvtsi32_si128, _mm_setr_epi8, _mm256_loadu_si256, _mm512_add_epi32, + _mm512_adds_epi16, _mm512_and_si512, _mm512_broadcast_i32x4, _mm512_castsi512_si128, + _mm512_castsi512_si256, _mm512_cvtepi16_epi32, _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, _mm512_extracti64x4_epi64, _mm512_loadu_si512, _mm512_max_epi16, _mm512_min_epi16, _mm512_mullo_epi32, _mm512_packs_epi32, _mm512_packus_epi16, _mm512_permutex2var_epi64, _mm512_permutexvar_epi64, _mm512_set1_epi16, _mm512_set1_epi32, _mm512_setr_epi64, - _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_srli_epi16, _mm512_sub_epi16, + _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_srl_epi16, _mm512_sub_epi16, _mm512_unpackhi_epi16, _mm512_unpacklo_epi16, }; @@ -251,7 +251,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn yuv420p10_to_rgb_row(
+pub(crate) unsafe fn yuv_420p_n_to_rgb_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
@@ -267,8 +267,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row(
     debug_assert!(rgb_out.len() >= width * 3);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
-    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
-    let bias = scalar::chroma_bias::<10>();
+    let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, 8>(full_range);
+    let bias = scalar::chroma_bias::<BITS>();
     const RND: i32 = 1 << 14;
 
     // SAFETY: AVX‑512BW availability is the caller's obligation.
@@ -278,7 +278,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row(
         let y_scale_v = _mm512_set1_epi32(y_scale);
         let c_scale_v = _mm512_set1_epi32(c_scale);
         let bias_v = _mm512_set1_epi16(bias as i16);
-        let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16);
+        let mask_v = _mm512_set1_epi16(scalar::bits_mask::<BITS>() as i16);
         let cru = _mm512_set1_epi32(coeffs.r_u());
         let crv = _mm512_set1_epi32(coeffs.r_v());
         let cgu = _mm512_set1_epi32(coeffs.g_u());
@@ -358,7 +358,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row(
     }
 
     if x < width {
-        scalar::yuv_420p_n_to_rgb_row::<10>(
+        scalar::yuv_420p_n_to_rgb_row::<BITS>(
             &y[x..width],
             &u_half[x / 2..width / 2],
             &v_half[x / 2..width / 2],
@@ -391,7 +391,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row(
 /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -407,10 +407,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX‑512BW availability is the caller's obligation. unsafe { @@ -419,8 +419,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); - let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16); - let max_v = _mm512_set1_epi16(OUT_MAX_10); + let mask_v = _mm512_set1_epi16(scalar::bits_mask::() as i16); + let max_v = _mm512_set1_epi16(out_max); let zero_v = _mm512_set1_epi16(0); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); @@ -508,7 +508,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -572,12 +572,15 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u } } -/// AVX‑512 P010 → packed **8‑bit** RGB. +/// AVX‑512 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 64 Y pixels / 32 chroma pairs per iteration. 
Mirrors -/// [`yuv420p10_to_rgb_row`] with two structural differences: -/// - Samples are shifted right by 6 (`_mm512_srli_epi16::<6>`) -/// instead of AND‑masked. +/// [`super::x86_avx512::yuv_420p_n_to_rgb_row`] with two structural +/// differences: +/// - Samples are shifted right by `16 - BITS` (`_mm512_srl_epi16`, +/// with a shift count computed from `BITS` once per call) instead +/// of AND‑masked. /// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx512`] /// — per‑128‑lane shuffle + 64‑bit permute + cross‑vector /// `_mm512_permutex2var_epi64` to produce 32‑sample U and V @@ -585,7 +588,8 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -595,7 +599,7 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -609,8 +613,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: AVX‑512BW availability is the caller's obligation. @@ -620,6 +624,8 @@ pub(crate) unsafe fn p010_to_rgb_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); let cgu = _mm512_set1_epi32(coeffs.g_u()); @@ -633,11 +639,12 @@ pub(crate) unsafe fn p010_to_rgb_row( let mut x = 0usize; while x + 64 <= width { - let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let y_low_i16 = _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); - let u_vec = _mm512_srli_epi16::<6>(u_vec); - let v_vec = _mm512_srli_epi16::<6>(v_vec); + let u_vec = _mm512_srl_epi16(u_vec, shr_count); + let v_vec = _mm512_srl_epi16(v_vec, shr_count); let u_i16 = _mm512_sub_epi16(u_vec, bias_v); let v_i16 = _mm512_sub_epi16(v_vec, bias_v); @@ -692,7 +699,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -704,12 +711,14 @@ pub(crate) unsafe fn p010_to_rgb_row( } } -/// AVX‑512 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed -/// `yuv420p10le` convention). +/// AVX‑512 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -719,7 +728,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -733,10 +742,10 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: AVX‑512BW availability is the caller's obligation. unsafe { @@ -745,8 +754,10 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let y_scale_v = _mm512_set1_epi32(y_scale); let c_scale_v = _mm512_set1_epi32(c_scale); let bias_v = _mm512_set1_epi16(bias as i16); - let max_v = _mm512_set1_epi16(OUT_MAX_10); + let max_v = _mm512_set1_epi16(out_max); let zero_v = _mm512_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS`. 
+ let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm512_set1_epi32(coeffs.r_u()); let crv = _mm512_set1_epi32(coeffs.r_v()); let cgu = _mm512_set1_epi32(coeffs.g_u()); @@ -760,11 +771,12 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let mut x = 0usize; while x + 64 <= width { - let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let y_low_i16 = _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = + _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); - let u_vec = _mm512_srli_epi16::<6>(u_vec); - let v_vec = _mm512_srli_epi16::<6>(v_vec); + let u_vec = _mm512_srl_epi16(u_vec, shr_count); + let v_vec = _mm512_srl_epi16(v_vec, shr_count); let u_i16 = _mm512_sub_epi16(u_vec, bias_v); let v_i16 = _mm512_sub_epi16(v_vec, bias_v); @@ -823,7 +835,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -1647,7 +1659,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1675,7 +1687,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1766,9 +1778,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = 
std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u8 diverges"); } @@ -1783,9 +1795,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u16 diverges"); } @@ -1835,4 +1847,171 @@ mod tests { check_p010_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- Generic BITS equivalence (12/14-bit coverage) ------------------ + + fn planar_n_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + (0..n) + .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16) + .collect() + } + + fn p_n_packed_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let shift = 16 - BITS; + (0..n) + .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift) + .collect() + } + + fn check_planar_u8_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut 
rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "AVX-512 planar {BITS}-bit → u8 diverges" + ); + } + + fn check_planar_u16_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "AVX-512 planar {BITS}-bit → u16 diverges" + ); + } + + fn check_pn_u8_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_avx512_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = 
p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn avx512_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx512_equivalence_n::<12>(64, m, full); + check_planar_u16_avx512_equivalence_n::<12>(64, m, full); + check_pn_u8_avx512_equivalence_n::<12>(64, m, full); + check_pn_u16_avx512_equivalence_n::<12>(64, m, full); + } + } + } + + #[test] + fn avx512_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx512_equivalence_n::<14>(64, m, full); + check_planar_u16_avx512_equivalence_n::<14>(64, m, full); + } + } + } + + #[test] + fn avx512_p12_matches_scalar_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_planar_u8_avx512_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx512_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_avx512_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx512_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn avx512_p14_matches_scalar_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_planar_u8_avx512_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx512_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + 
} + } } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 66e385b..75796bf 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -37,10 +37,10 @@ use core::arch::x86_64::{ __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_and_si128, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, - _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, _mm_mullo_epi32, _mm_packs_epi32, - _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, - _mm_srai_epi32, _mm_srli_epi16, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, - _mm_unpackhi_epi64, _mm_unpacklo_epi16, _mm_unpacklo_epi64, + _mm_cvtsi32_si128, _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, + _mm_mullo_epi32, _mm_packs_epi32, _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, + _mm_setr_epi8, _mm_shuffle_epi8, _mm_srai_epi32, _mm_srl_epi16, _mm_srli_si128, _mm_sub_epi16, + _mm_unpackhi_epi16, _mm_unpackhi_epi64, _mm_unpacklo_epi16, _mm_unpacklo_epi64, }; use crate::{ @@ -193,12 +193,14 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } -/// SSE4.1 P010 → packed **8‑bit** RGB. +/// SSE4.1 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **8‑bit** RGB. /// /// Block size 16 Y pixels / 8 chroma pairs per iteration. Differences -/// from [`yuv420p10_to_rgb_row`]: -/// - Samples are shifted right by 6 (`_mm_srli_epi16::<6>`) instead -/// of AND‑masked — P010's 10 active bits live in the HIGH 10 of +/// from [`super::x86_sse41::yuv_420p_n_to_rgb_row`]: +/// - Samples are shifted right by `16 - BITS` (`_mm_srl_epi16`, with +/// a shift count computed from `BITS` once per call) instead of +/// AND‑masked — Pn's `BITS` active bits live in the HIGH `BITS` of /// each `u16`. 
/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16`] /// below (one `_mm_shuffle_epi8` + two 64‑bit unpacks per 16 @@ -206,7 +208,8 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -216,7 +219,7 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p010_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -230,8 +233,8 @@ pub(crate) unsafe fn p010_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: SSE4.1 availability is the caller's obligation. @@ -241,6 +244,10 @@ pub(crate) unsafe fn p010_to_rgb_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); + // High-bit-packed samples: shift right by `16 - BITS` to extract + // the BITS-bit value. Loop-invariant, loaded once into the low 64b + // of `shr_count` for `_mm_srl_epi16`. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -250,15 +257,15 @@ pub(crate) unsafe fn p010_to_rgb_row( let mut x = 0usize; while x + 16 <= width { - // Y: two u16×8 loads, each shifted right by 6. 
- let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + // Y: two u16×8 loads, each shifted right by `16 - BITS`. + let y_low_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), shr_count); // UV: two u16×8 loads of interleaved [U0,V0,U1,V1,...], then // deinterleave into separate u_vec + v_vec. let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); - let u_vec = _mm_srli_epi16::<6>(u_vec); - let v_vec = _mm_srli_epi16::<6>(v_vec); + let u_vec = _mm_srl_epi16(u_vec, shr_count); + let v_vec = _mm_srl_epi16(v_vec, shr_count); let u_i16 = _mm_sub_epi16(u_vec, bias_v); let v_i16 = _mm_sub_epi16(v_vec, bias_v); @@ -304,7 +311,7 @@ pub(crate) unsafe fn p010_to_rgb_row( } if x < width { - scalar::p010_to_rgb_row( + scalar::p_n_to_rgb_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -316,12 +323,14 @@ pub(crate) unsafe fn p010_to_rgb_row( } } -/// SSE4.1 P010 → packed **10‑bit `u16`** RGB (native‑depth, -/// low‑bit‑packed — `yuv420p10le` convention). +/// SSE4.1 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed +/// **native‑depth `u16`** RGB (low‑bit‑packed output, `yuv420pNle` +/// convention). /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS`. /// /// # Safety /// @@ -331,7 +340,7 @@ pub(crate) unsafe fn p010_to_rgb_row( /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p010_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -345,10 +354,10 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: SSE4.1 availability is the caller's obligation. unsafe { @@ -357,8 +366,11 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let max_v = _mm_set1_epi16(OUT_MAX_10); + let max_v = _mm_set1_epi16(out_max); let zero_v = _mm_set1_epi16(0); + // High-bit-packed samples: shift right by `16 - BITS` to extract + // the BITS-bit value. 
+ let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -368,11 +380,11 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); - let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + let y_low_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x).cast()), shr_count); + let y_high_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), shr_count); let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); - let u_vec = _mm_srli_epi16::<6>(u_vec); - let v_vec = _mm_srli_epi16::<6>(v_vec); + let u_vec = _mm_srl_epi16(u_vec, shr_count); + let v_vec = _mm_srl_epi16(v_vec, shr_count); let u_i16 = _mm_sub_epi16(u_vec, bias_v); let v_i16 = _mm_sub_epi16(v_vec, bias_v); @@ -415,7 +427,7 @@ pub(crate) unsafe fn p010_to_rgb_u16_row( } if x < width { - scalar::p010_to_rgb_u16_row( + scalar::p_n_to_rgb_u16_row::( &y[x..width], &uv_half[x..width], &mut rgb_out[x * 3..width * 3], @@ -486,7 +498,7 @@ unsafe fn deinterleave_uv_u16(ptr: *const u16) -> (__m128i, __m128i) { /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv420p10_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -502,8 +514,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; // SAFETY: SSE4.1 availability is the caller's obligation; the @@ -516,7 +528,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::() as i16); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); let cgu = _mm_set1_epi32(coeffs.g_u()); @@ -579,7 +591,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( } if x < width { - scalar::yuv_420p_n_to_rgb_row::<10>( + scalar::yuv_420p_n_to_rgb_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -614,7 +626,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_row( /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -630,10 +642,10 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3); let coeffs = scalar::Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); - let bias = scalar::chroma_bias::<10>(); + let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); + let bias = scalar::chroma_bias::(); const RND: i32 = 1 << 14; - const OUT_MAX_10: i16 = 1023; + let out_max: i16 = ((1i32 << BITS) - 1) as i16; // SAFETY: SSE4.1 availability is the caller's obligation. unsafe { @@ -642,8 +654,8 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( let y_scale_v = _mm_set1_epi32(y_scale); let c_scale_v = _mm_set1_epi32(c_scale); let bias_v = _mm_set1_epi16(bias as i16); - let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); - let max_v = _mm_set1_epi16(OUT_MAX_10); + let mask_v = _mm_set1_epi16(scalar::bits_mask::() as i16); + let max_v = _mm_set1_epi16(out_max); let zero_v = _mm_set1_epi16(0); let cru = _mm_set1_epi32(coeffs.r_u()); let crv = _mm_set1_epi32(coeffs.r_v()); @@ -708,7 +720,7 @@ pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], @@ -1389,7 +1401,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1417,7 +1429,7 @@ mod tests { scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - 
yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -1508,9 +1520,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u8 diverges"); } @@ -1525,9 +1537,9 @@ mod tests { let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u16 diverges"); } @@ -1577,4 +1589,180 @@ mod tests { check_p010_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false); check_p010_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- Generic BITS equivalence (12/14-bit coverage) ------------------ + // + // The helpers below parameterize over `const BITS: u32` so the same + // scalar-equivalence scaffolding covers 10/12/14 without duplicating + // the 16-pixel block seeding + diff harness. `<10>` is already + // exercised by the dedicated tests above; `<12>` / `<14>` add + // regression coverage for the new yuv420p12 / yuv420p14 / P012 + // kernels. 14-bit is planar-only (no P014 in Ship 4a). 
+ + fn planar_n_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + (0..n) + .map(|i| ((i * seed + seed * 3) as u32 & mask) as u16) + .collect() + } + + fn p_n_packed_plane(n: usize, seed: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let shift = 16 - BITS; + (0..n) + .map(|i| (((i * seed + seed * 3) as u32 & mask) as u16) << shift) + .collect() + } + + fn check_planar_u8_sse41_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "SSE4.1 planar {BITS}-bit → u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + fn check_planar_u16_sse41_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_simd, + "SSE4.1 planar {BITS}-bit → u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + fn check_pn_u8_sse41_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: 
bool, + ) { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 Pn {BITS}-bit → u8 diverges"); + } + + fn check_pn_u16_sse41_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, + ) { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 Pn {BITS}-bit → u16 diverges"); + } + + #[test] + fn sse41_p12_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_sse41_equivalence_n::<12>(16, m, full); + check_planar_u16_sse41_equivalence_n::<12>(16, m, full); + check_pn_u8_sse41_equivalence_n::<12>(16, m, full); + check_pn_u16_sse41_equivalence_n::<12>(16, m, full); + } + } + } + + #[test] + fn sse41_p14_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for 
full in [true, false] { + check_planar_u8_sse41_equivalence_n::<14>(16, m, full); + check_planar_u16_sse41_equivalence_n::<14>(16, m, full); + } + } + } + + #[test] + fn sse41_p12_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_sse41_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_planar_u16_sse41_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + check_pn_u8_sse41_equivalence_n::<12>(w, ColorMatrix::Bt601, false); + check_pn_u16_sse41_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + } + } + + #[test] + fn sse41_p14_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_planar_u8_sse41_equivalence_n::<14>(w, ColorMatrix::Bt601, false); + check_planar_u16_sse41_equivalence_n::<14>(w, ColorMatrix::Bt709, true); + } + } } diff --git a/src/row/mod.rs b/src/row/mod.rs index 80afab7..1201ac6 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -350,7 +350,7 @@ pub fn yuv420p10_to_rgb_row( // SAFETY: NEON verified on this CPU; bounds / parity are // the caller's obligation (asserted above). unsafe { - arch::neon::yuv420p10_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -359,7 +359,7 @@ pub fn yuv420p10_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv420p10_to_rgb_row( + arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -368,7 +368,7 @@ pub fn yuv420p10_to_rgb_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv420p10_to_rgb_row( + arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -377,7 +377,7 @@ pub fn yuv420p10_to_rgb_row( if sse41_available() { // SAFETY: SSE4.1 verified. 
unsafe { - arch::x86_sse41::yuv420p10_to_rgb_row( + arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -388,7 +388,7 @@ pub fn yuv420p10_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv420p10_to_rgb_row( + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -444,7 +444,7 @@ pub fn yuv420p10_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv420p10_to_rgb_u16_row( + arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -455,7 +455,7 @@ pub fn yuv420p10_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv420p10_to_rgb_u16_row( + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -464,7 +464,7 @@ pub fn yuv420p10_to_rgb_u16_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv420p10_to_rgb_u16_row( + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -473,7 +473,7 @@ pub fn yuv420p10_to_rgb_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv420p10_to_rgb_u16_row( + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -484,7 +484,7 @@ pub fn yuv420p10_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv420p10_to_rgb_u16_row( + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -504,7 +504,7 @@ pub fn yuv420p10_to_rgb_u16_row( /// /// This is the HDR hardware‑decode keystone format: VideoToolbox, /// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit -/// output. See `scalar::p010_to_rgb_row` for the full semantic +/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic /// specification. `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -529,7 +529,7 @@ pub fn p010_to_rgb_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -538,21 +538,21 @@ pub fn p010_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -561,7 +561,7 @@ pub fn p010_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -570,7 +570,7 @@ pub fn p010_to_rgb_row( } } - scalar::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P010** to **native‑depth `u16`** packed RGB @@ -579,7 +579,7 @@ pub fn p010_to_rgb_row( /// Callers feeding this output into a P010 consumer must shift left /// by 6. /// -/// See `scalar::p010_to_rgb_u16_row` for the full spec. +/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. /// `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -604,7 +604,7 @@ pub fn p010_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -613,21 +613,21 @@ pub fn p010_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
unsafe { - arch::x86_sse41::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -636,7 +636,7 @@ pub fn p010_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p010_to_rgb_u16_row( + arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( y, uv_half, rgb_out, width, matrix, full_range, ); } @@ -647,7 +647,452 @@ pub fn p010_to_rgb_u16_row( } } - scalar::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 12 active bits in the low 12 bits of each +/// element (low‑bit‑packed `yuv420p12le` convention). Output is packed +/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The +/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching +/// `yuv420p12le` convention — upper 4 bits zero). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (14‑bit values in the low 14 of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ +/// packed — 12 active bits in the high 12 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and +/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but +/// `>> 4` instead of `>> 6` at each `u16` load. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** to **native‑depth `u16`** packed RGB +/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed +/// `yuv420p12le` convention, **not** P012's high‑bit packing). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 527ea4d..8d45b48 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -207,6 +207,13 @@ pub(crate) fn yuv_420p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // Low-bit-packed planar kernels are defined for BITS in {10, 12, 14}. + // 16 would overflow the Q15 chroma sum; 8 belongs to the non- + // const-generic `yuv_420_to_rgb_row` family. + debug_assert!( + BITS == 10 || BITS == 12 || BITS == 14, + "yuv_420p_n_to_rgb_row only supports BITS in {{10, 12, 14}}" + ); debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); @@ -300,6 +307,12 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // Same BITS range as the u8-output counterpart. See + // `yuv_420p_n_to_rgb_row` for the rationale. 
+ debug_assert!( + BITS == 10 || BITS == 12 || BITS == 14, + "yuv_420p_n_to_rgb_u16_row only supports BITS in {{10, 12, 14}}" + ); debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); @@ -347,16 +360,17 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( // ---- P010 (semi-planar 10-bit, high-bit-packed) → RGB ------------------ /// Converts one row of P010 (semi‑planar 4:2:0 with UV interleaved, -/// 10 active bits in the **high** 10 of each `u16`) to **8‑bit** -/// packed RGB. +/// `BITS` active bits in the **high** `BITS` of each `u16`) to +/// **8‑bit** packed RGB. /// /// Structurally identical to [`nv12_to_rgb_row`] plus the per‑sample -/// shift: each `u16` load is extracted to its 10‑bit value via -/// `sample >> 6`, then the same Q15 pipeline as -/// [`yuv_420p_n_to_rgb_row`] runs with `BITS == 10`. Mispacked input -/// — e.g. a `yuv420p10le` buffer with values in the **low** 10 bits -/// — is masked down to a small positive number (producing near‑black -/// output) rather than silent garbage, matching every SIMD backend. +/// shift: each `u16` load is extracted to its `BITS`‑bit value via +/// `sample >> (16 - BITS)`, then the same Q15 pipeline as +/// [`yuv_420p_n_to_rgb_row`] runs with the same `BITS`. For `BITS == +/// 10` this is P010 (`>> 6`); for `BITS == 12` it's P012 (`>> 4`). +/// Mispacked input — e.g. a low‑bit‑packed buffer handed to this +/// kernel — has its active low bits discarded (producing near‑black +/// output), matching every SIMD backend. /// /// # Panics (debug builds) /// @@ -364,7 +378,7 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p010_to_rgb_row( +pub(crate) fn p_n_to_rgb_row<const BITS: usize>( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -372,26 +386,36 @@ pub(crate) fn p010_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - debug_assert_eq!(width & 1, 0, "P010 requires even width"); + // High-bit-packed Pn kernels are only defined for BITS in {10, 12}. + // Outside that set, `16 - BITS` could under/overflow and the Q15 + // coefficient table has no corresponding entry. Caught here before + // the SIMD dispatcher hands control to unsafe code. + debug_assert!( + BITS == 10 || BITS == 12, + "p_n_to_rgb_row only supports BITS in {{10, 12}}" + ); + debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); let coeffs = Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = range_params_n::<10, 8>(full_range); - let bias = chroma_bias::<10>(); - - // Each `u16` load is converted to its 10-bit sample with `>> 6`, - // extracting the upper 10 bits and leaving the result in - // `[0, 1023]`. If low-packed input (`yuv420p10le`) is handed to - // this kernel by mistake, that shift discards the active low 6 bits - // rather than recovering the intended 10-bit value. No hot-path - // cost: one shift per load. + let (y_off, y_scale, c_scale) = range_params_n::<BITS, 8>(full_range); + let bias = chroma_bias::<BITS>(); + let shift = 16 - BITS; + + // Each `u16` load is converted to its `BITS`-bit sample with + // `>> (16 - BITS)` — 6 for P010, 4 for P012. Extracts the upper + // bits and leaves the result in `[0, (1 << BITS) - 1]`. If + // low-packed input (`yuv420p10le`, `yuv420p12le`) is handed to + // this kernel by mistake, the shift discards the active low bits + // rather than recovering the intended value. No hot-path cost: + // one shift per load. 
let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2] >> 6; - let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_sample = uv_half[c_idx * 2] >> shift; + let v_sample = uv_half[c_idx * 2 + 1] >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -399,12 +423,12 @@ pub(crate) fn p010_to_rgb_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); rgb_out[x * 3] = clamp_u8(y0 + r_chroma); rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); - let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); @@ -413,15 +437,18 @@ pub(crate) fn p010_to_rgb_row( } } -/// Converts one row of P010 to **native‑depth `u16`** packed RGB -/// (10 active bits in the low bits of each `u16`, matching -/// `yuv420p10le` convention — **not** P010's high‑bit packing). +/// Converts one row of high‑bit‑packed semi‑planar 4:2:0 +/// (`BITS` ∈ {10, 12}: P010, P012) to **native‑depth `u16`** +/// packed RGB — samples are **low‑bit‑packed** on output +/// (`[0, (1 << BITS) - 1]` in the low bits of each `u16`, upper bits +/// zero), matching the `yuv420p10le` / `yuv420p12le` convention — +/// **not** the P010/P012 high‑bit packing. Callers feeding a P010/ +/// P012 consumer must shift the output left by `16 - BITS`. /// -/// Mirrors [`yuv_420p_n_to_rgb_u16_row::<10>`] on the math side; the -/// only difference is the input shift (`sample >> 6` instead of -/// `sample & 0x3FF`) and the UV deinterleave. 
Output is suitable for -/// direct consumption by downstream `yuv420p10le`‑shaped tooling. If -/// you need P010‑packed RGB output, shift left by 6 on the caller. +/// Mirrors [`yuv_420p_n_to_rgb_u16_row`] on the math side; the only +/// differences are the input shift (`sample >> (16 - BITS)` to +/// extract the `BITS`-bit value from the high-bit packing) and the +/// interleaved UV layout. /// /// # Panics (debug builds) /// @@ -429,7 +456,7 @@ pub(crate) fn p010_to_rgb_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p010_to_rgb_u16_row( +pub(crate) fn p_n_to_rgb_u16_row<const BITS: usize>( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -437,21 +464,28 @@ pub(crate) fn p010_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - debug_assert_eq!(width & 1, 0, "P010 requires even width"); + // See `p_n_to_rgb_row` for the BITS range rationale. Duplicated + // here so either entry point catches misuse on its own. 
+ debug_assert!( + BITS == 10 || BITS == 12, + "p_n_to_rgb_u16_row only supports BITS in {{10, 12}}" + ); + debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); let coeffs = Coefficients::for_matrix(matrix); - let (y_off, y_scale, c_scale) = range_params_n::<10, 10>(full_range); - let bias = chroma_bias::<10>(); - let out_max: i32 = (1i32 << 10) - 1; + let (y_off, y_scale, c_scale) = range_params_n::<BITS, BITS>(full_range); + let bias = chroma_bias::<BITS>(); + let out_max: i32 = (1i32 << BITS) - 1; + let shift = 16 - BITS; let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2] >> 6; - let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_sample = uv_half[c_idx * 2] >> shift; + let v_sample = uv_half[c_idx * 2 + 1] >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -459,12 +493,12 @@ pub(crate) fn p010_to_rgb_u16_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; - let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; @@ -1131,7 +1165,7 @@ mod tests { let y = [0u16; 4]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; // U0 V0 U1 V1 
let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); } @@ -1141,7 +1175,7 @@ mod tests { let y = [0xFFC0u16; 4]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); } @@ -1151,7 +1185,7 @@ mod tests { let y = [0x8000u16; 4]; let uv = [0x8000u16; 4]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); assert_eq!(r, g); @@ -1167,7 +1201,7 @@ mod tests { let y = [0x1000u16, 0x1000, 0xEB00, 0xEB00]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; let mut rgb = [0u8; 12]; - p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); + p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); @@ -1196,7 +1230,7 @@ mod tests { ColorMatrix::Bt709, true, ); - p010_to_rgb_row( + p_n_to_rgb_row::<10>( &y_p010, &uv_p010, &mut rgb_p010, @@ -1214,7 +1248,7 @@ mod tests { let y = [0xFFC0u16; 4]; let uv = [0x8000u16; 4]; let mut rgb = [0u16; 12]; - p010_to_rgb_u16_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 1023), "got {rgb:?}"); } @@ -1223,7 +1257,7 @@ mod tests { let y = [0x1000u16, 0xEB00]; let uv = [0x8000u16, 0x8000]; let mut rgb = [0u16; 6]; - p010_to_rgb_u16_row(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); + 
p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 92835a5..4d210a7 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -2,9 +2,17 @@ //! written into my own buffers" consumer. //! //! Generic over the source format via an `F: SourceFormat` type -//! parameter. One `PixelSink` impl per supported format; v0.1 ships -//! the [`Yuv420p`](crate::yuv::Yuv420p), -//! [`Nv12`](crate::yuv::Nv12), and [`Nv21`](crate::yuv::Nv21) impls. +//! parameter. One `PixelSink` impl per supported format. Currently +//! ships impls for: +//! +//! - 8‑bit 4:2:0: [`Yuv420p`](crate::yuv::Yuv420p), +//! [`Nv12`](crate::yuv::Nv12), [`Nv21`](crate::yuv::Nv21). +//! - 10/12/14‑bit planar 4:2:0: [`Yuv420p10`](crate::yuv::Yuv420p10), +//! [`Yuv420p12`](crate::yuv::Yuv420p12), +//! [`Yuv420p14`](crate::yuv::Yuv420p14). +//! - 10/12‑bit semi‑planar high‑bit‑packed 4:2:0: +//! [`P010`](crate::yuv::P010), [`P012`](crate::yuv::P012). +//! //! All configuration and processing methods are fallible — no panics //! under normal contract violations — so the sink is usable on //! `panic = "abort"` targets. 
@@ -19,12 +27,15 @@ use thiserror::Error; use crate::{ HsvBuffers, PixelSink, SourceFormat, row::{ - nv12_to_rgb_row, nv21_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, rgb_to_hsv_row, - yuv_420_to_rgb_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, + nv12_to_rgb_row, nv21_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, + p012_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv420p10_to_rgb_row, + yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, + yuv420p14_to_rgb_u16_row, }, yuv::{ - Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, P010, P010Row, P010Sink, Yuv420p, Yuv420p10, - Yuv420p10Row, Yuv420p10Sink, Yuv420pRow, Yuv420pSink, + Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, P010, P010Row, P010Sink, P012, P012Row, + P012Sink, Yuv420p, Yuv420p10, Yuv420p10Row, Yuv420p10Sink, Yuv420p12, Yuv420p12Row, + Yuv420p12Sink, Yuv420p14, Yuv420p14Row, Yuv420p14Sink, Yuv420pRow, Yuv420pSink, }, }; @@ -225,6 +236,39 @@ pub enum RowSlice { /// bits sit in the high 10 of its `u16`). #[display("UV Half 10")] UvHalf10, + /// Full‑width Y row of a **12‑bit** source — used for both the + /// planar ([`Yuv420p12`], low‑bit‑packed) and semi‑planar + /// ([`P012`], high‑bit‑packed) families. `u16` samples, `width` + /// elements. The packing direction depends on the source format; + /// the row‑shape check only verifies length, so a single variant + /// covers both. + #[display("Y12")] + Y12, + /// Half‑width U row of a **12‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("U Half 12")] + UHalf12, + /// Half‑width V row of a **12‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("V Half 12")] + VHalf12, + /// Half‑width interleaved UV row of a **12‑bit semi‑planar** source + /// ([`P012`]). `u16` samples, `width` elements (high‑bit‑packed: 12 + /// active bits in the high 12 of each `u16`). 
+ #[display("UV Half 12")] + UvHalf12, + /// Full‑width Y row of a **14‑bit** planar source ([`Yuv420p14`]). + /// `u16` samples, `width` elements, low‑bit‑packed. + #[display("Y14")] + Y14, + /// Half‑width U row of a **14‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("U Half 14")] + UHalf14, + /// Half‑width V row of a **14‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("V Half 14")] + VHalf14, } /// A sink that writes any subset of `{RGB, Luma, HSV}` into @@ -244,10 +288,9 @@ pub enum RowSlice { /// # Type parameter /// /// `F` identifies the source format — `Yuv420p`, `Nv12`, `Nv21`, -/// `Bgr24`, etc. Each format provides its own -/// `impl PixelSink for MixedSinker<'_, F>`. v0.1 ships impls for -/// [`Yuv420p`](crate::yuv::Yuv420p), [`Nv12`](crate::yuv::Nv12), and -/// [`Nv21`](crate::yuv::Nv21). +/// `Yuv420p10`, `Yuv420p12`, `Yuv420p14`, `P010`, `P012`, etc. Each +/// format provides its own `impl PixelSink for MixedSinker<'_, F>`. +/// See the module‑level docs for the full list of shipped impls. pub struct MixedSinker<'a, F: SourceFormat> { rgb: Option<&'a mut [u8]>, rgb_u16: Option<&'a mut [u16]>, @@ -1297,180 +1340,724 @@ impl PixelSink for MixedSinker<'_, P010> { } } -/// Returns `Ok(())` iff the walker's frame dimensions exactly match -/// the sinker's configured dimensions. Called from -/// [`PixelSink::begin_frame`] on both `MixedSinker` and -/// `MixedSinker`. -/// -/// The sinker's RGB / luma / HSV buffers were sized for -/// `configured_w × configured_h`. A shorter frame would silently -/// leave the bottom rows of those buffers stale from the previous -/// frame; a taller frame would overrun them. Either is a real -/// failure mode, but neither is a panic-worthy bug — the caller can -/// recover by rebuilding the sinker. Returning `Err` before any row -/// is processed guarantees no partial output. 
-#[cfg_attr(not(tarpaulin), inline(always))] -fn check_dimensions_match( - configured_w: usize, - configured_h: usize, - frame_w: u32, - frame_h: u32, -) -> Result<(), MixedSinkerError> { - let fw = frame_w as usize; - let fh = frame_h as usize; - if fw != configured_w || fh != configured_h { - return Err(MixedSinkerError::DimensionMismatch { - configured_w, - configured_h, - frame_w, - frame_h, - }); - } - Ok(()) -} - -#[cfg(all(test, feature = "std"))] -mod tests { - use super::*; - use crate::{ - ColorMatrix, - frame::{Nv12Frame, Nv21Frame, P010Frame, Yuv420p10Frame, Yuv420pFrame}, - yuv::{nv12_to, nv21_to, p010_to, yuv420p_to, yuv420p10_to}, - }; +// ---- Yuv420p12 impl ---------------------------------------------------- - fn solid_yuv420p_frame( - width: u32, - height: u32, - y: u8, - u: u8, - v: u8, - ) -> (Vec, Vec, Vec) { - let w = width as usize; - let h = height as usize; - let cw = w / 2; - let ch = h / 2; - ( - std::vec![y; w * h], - std::vec![u; cw * ch], - std::vec![v; cw * ch], - ) +impl<'a> MixedSinker<'a, Yuv420p12> { + /// Attaches a packed **`u16`** RGB output buffer. Mirrors + /// [`MixedSinker::with_rgb_u16`] but produces 12‑bit + /// output (values in `[0, 4095]` in the low 12 of each `u16`, upper + /// 4 zero). Length is measured in `u16` **elements** (`width × + /// height × 3`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) } - #[test] - fn luma_only_copies_y_plane() { - let (yp, up, vp) = solid_yuv420p_frame(16, 8, 42, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); - - let mut luma = std::vec![0u8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) - .with_luma(&mut luma) - .unwrap(); - yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - - assert!(luma.iter().all(|&y| y == 42), "luma should be solid 42"); + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) } +} - #[test] - fn rgb_only_converts_gray_to_gray() { - // Neutral chroma → gray RGB; solid Y=128 → ~128 in every RGB byte. - let (yp, up, vp) = solid_yuv420p_frame(16, 8, 128, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); +impl Yuv420p12Sink for MixedSinker<'_, Yuv420p12> {} - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb) - .unwrap(); - yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); +impl PixelSink for MixedSinker<'_, Yuv420p12> { + type Input<'r> = Yuv420p12Row<'r>; + type Error = MixedSinkerError; - for px in rgb.chunks(3) { - assert!(px[0].abs_diff(128) <= 1); - assert_eq!(px[0], px[1]); - assert_eq!(px[1], px[2]); + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); } + check_dimensions_match(self.width, self.height, width, height) } - #[test] - fn hsv_only_allocates_scratch_and_produces_gray_hsv() { - // Neutral gray → H=0, S=0, V=~128. No RGB buffer provided. - let (yp, up, vp) = solid_yuv420p_frame(16, 8, 128, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + fn process(&mut self, row: Yuv420p12Row<'_>) -> Result<(), Self::Error> { + // Bit depth is fixed by the format (12) — declared as a const so + // the downshift for u8 luma stays obvious at the call site. 
+ const BITS: u32 = 12; - let mut h = std::vec![0xFFu8; 16 * 8]; - let mut s = std::vec![0xFFu8; 16 * 8]; - let mut v = std::vec![0xFFu8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) - .with_hsv(&mut h, &mut s, &mut v) - .unwrap(); - yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; - assert!(h.iter().all(|&b| b == 0)); - assert!(s.iter().all(|&b| b == 0)); - assert!(v.iter().all(|&b| b.abs_diff(128) <= 1)); - } + if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y12, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.u_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf12, + row: idx, + expected: w / 2, + actual: row.u_half().len(), + }); + } + if row.v_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::VHalf12, + row: idx, + expected: w / 2, + actual: row.v_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } - #[test] - fn mixed_all_three_outputs_populated() { - let (yp, up, vp) = solid_yuv420p_frame(16, 8, 200, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. 
+ } = self; - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut luma = std::vec![0u8; 16 * 8]; - let mut h = std::vec![0u8; 16 * 8]; - let mut s = std::vec![0u8; 16 * 8]; - let mut v = std::vec![0u8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb) - .unwrap() - .with_luma(&mut luma) - .unwrap() - .with_hsv(&mut h, &mut s, &mut v) - .unwrap(); - yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; - // Luma = Y plane verbatim. - assert!(luma.iter().all(|&y| y == 200)); - // RGB gray. - for px in rgb.chunks(3) { - assert!(px[0].abs_diff(200) <= 1); + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> (BITS - 8)) as u8; + } } - // HSV of gray. - assert!(h.iter().all(|&b| b == 0)); - assert!(s.iter().all(|&b| b == 0)); - assert!(v.iter().all(|&b| b.abs_diff(200) <= 1)); - } - - #[test] - fn rgb_with_hsv_uses_user_buffer_not_scratch() { - // When caller provides RGB, the scratch should remain empty (Vec len 0). 
- let (yp, up, vp) = solid_yuv420p_frame(16, 8, 100, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut h = std::vec![0u8; 16 * 8]; - let mut s = std::vec![0u8; 16 * 8]; - let mut v = std::vec![0u8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb) - .unwrap() - .with_hsv(&mut h, &mut s, &mut v) - .unwrap(); - yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + yuv420p12_to_rgb_u16_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } - assert_eq!( - sink.rgb_scratch.len(), - 0, - "scratch should stay unallocated when RGB buffer is provided" - ); - } + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } - #[test] - fn with_simd_false_matches_with_simd_true() { - // A/B test: same frame, one sinker forces scalar, the other uses - // SIMD. NEON is bit‑exact to scalar so outputs must match. 
- let w = 32usize; - let h = 16usize; - let (yp, up, vp) = solid_yuv420p_frame(w as u32, h as u32, 180, 60, 200); - let src = Yuv420pFrame::new( + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + yuv420p12_to_rgb_row( + row.y(), + row.u_half(), + row.v_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + +// ---- Yuv420p14 impl ---------------------------------------------------- + +impl<'a> MixedSinker<'a, Yuv420p14> { + /// Attaches a packed **`u16`** RGB output buffer. Produces 14‑bit + /// output (values in `[0, 16383]` in the low 14 of each `u16`, upper + /// 2 zero). Length is measured in `u16` **elements** (`width × + /// height × 3`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl Yuv420p14Sink for MixedSinker<'_, Yuv420p14> {} + +impl PixelSink for MixedSinker<'_, Yuv420p14> { + type Input<'r> = Yuv420p14Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: Yuv420p14Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 14; + + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y14, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.u_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf14, + row: idx, + expected: w / 2, + actual: row.u_half().len(), + }); + } + if row.v_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::VHalf14, + row: idx, + expected: w / 2, + actual: row.v_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. 
+ } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> (BITS - 8)) as u8; + } + } + + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + yuv420p14_to_rgb_u16_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + yuv420p14_to_rgb_row( + row.y(), + row.u_half(), + row.v_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + +// ---- P012 impl --------------------------------------------------------- + +impl<'a> MixedSinker<'a, P012> { + /// Attaches a packed **`u16`** RGB output buffer. 
Produces 12‑bit
+    /// output in **low‑bit‑packed** `yuv420p12le` convention (values in
+    /// `[0, 4095]` in the low 12 of each `u16`, upper 4 zero) —
+    /// **not** P012's high‑bit packing. Callers feeding a P012 consumer
+    /// must shift the output left by 4.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgb_u16(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected_elements = self.frame_bytes(3)?;
+        if buf.len() < expected_elements {
+            return Err(MixedSinkerError::RgbU16BufferTooShort {
+                expected: expected_elements,
+                actual: buf.len(),
+            });
+        }
+        self.rgb_u16 = Some(buf);
+        Ok(self)
+    }
+}
+
+impl P012Sink for MixedSinker<'_, P012> {}
+
+impl PixelSink for MixedSinker<'_, P012> {
+    type Input<'r> = P012Row<'r>;
+    type Error = MixedSinkerError;
+
+    fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> {
+        if self.width & 1 != 0 {
+            return Err(MixedSinkerError::OddWidth { width: self.width });
+        }
+        check_dimensions_match(self.width, self.height, width, height)
+    }
+
+    fn process(&mut self, row: P012Row<'_>) -> Result<(), Self::Error> {
+        let w = self.width;
+        let h = self.height;
+        let idx = row.row();
+        let use_simd = self.simd;
+
+        if w & 1 != 0 {
+            return Err(MixedSinkerError::OddWidth { width: w });
+        }
+        if row.y().len() != w {
+            return Err(MixedSinkerError::RowShapeMismatch {
+                which: RowSlice::Y12,
+                row: idx,
+                expected: w,
+                actual: row.y().len(),
+            });
+        }
+        if row.uv_half().len() != w {
+            return Err(MixedSinkerError::RowShapeMismatch {
+                which: RowSlice::UvHalf12,
+                row: idx,
+                expected: w,
+                actual: row.uv_half().len(),
+            });
+        }
+        if idx >= self.height {
+            return Err(MixedSinkerError::RowIndexOutOfRange {
+                row: idx,
+                configured_height: self.height,
+            });
+        }
+
+        let
Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. + } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + // Luma: P012 samples are high‑bit‑packed (`value << 4`). Taking + // the high byte via `>> 8` gives the top 8 bits of the 12‑bit + // value — identical accessor to P010 (both put active bits in the + // high `BITS` positions of the `u16`). + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> 8) as u8; + } + } + + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + p012_to_rgb_u16_row( + row.y(), + row.uv_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + p012_to_rgb_row( + row.y(), + row.uv_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut 
hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + +/// Returns `Ok(())` iff the walker's frame dimensions exactly match +/// the sinker's configured dimensions. Called from +/// [`PixelSink::begin_frame`] on both `MixedSinker` and +/// `MixedSinker`. +/// +/// The sinker's RGB / luma / HSV buffers were sized for +/// `configured_w × configured_h`. A shorter frame would silently +/// leave the bottom rows of those buffers stale from the previous +/// frame; a taller frame would overrun them. Either is a real +/// failure mode, but neither is a panic-worthy bug — the caller can +/// recover by rebuilding the sinker. Returning `Err` before any row +/// is processed guarantees no partial output. +#[cfg_attr(not(tarpaulin), inline(always))] +fn check_dimensions_match( + configured_w: usize, + configured_h: usize, + frame_w: u32, + frame_h: u32, +) -> Result<(), MixedSinkerError> { + let fw = frame_w as usize; + let fh = frame_h as usize; + if fw != configured_w || fh != configured_h { + return Err(MixedSinkerError::DimensionMismatch { + configured_w, + configured_h, + frame_w, + frame_h, + }); + } + Ok(()) +} + +#[cfg(all(test, feature = "std"))] +mod tests { + use super::*; + use crate::{ + ColorMatrix, + frame::{ + Nv12Frame, Nv21Frame, P010Frame, P012Frame, Yuv420p10Frame, Yuv420p12Frame, Yuv420p14Frame, + Yuv420pFrame, + }, + yuv::{ + nv12_to, nv21_to, p010_to, p012_to, yuv420p_to, yuv420p10_to, yuv420p12_to, yuv420p14_to, + }, + }; + + fn solid_yuv420p_frame( + width: u32, + height: u32, + y: u8, + u: u8, + v: u8, + ) -> (Vec, Vec, Vec) { + let w = width as usize; + let h = height as usize; + let cw = w / 2; + let ch = h / 2; + ( + std::vec![y; w * h], + std::vec![u; cw * ch], + std::vec![v; cw * ch], + ) + } + + #[test] + fn luma_only_copies_y_plane() { + let (yp, up, vp) = solid_yuv420p_frame(16, 8, 42, 128, 128); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut 
sink = MixedSinker::<Yuv420p>::new(16, 8)
+            .with_luma(&mut luma)
+            .unwrap();
+        yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(luma.iter().all(|&y| y == 42), "luma should be solid 42");
+    }
+
+    #[test]
+    fn rgb_only_converts_gray_to_gray() {
+        // Neutral chroma → gray RGB; solid Y=128 → ~128 in every RGB byte.
+        let (yp, up, vp) = solid_yuv420p_frame(16, 8, 128, 128, 128);
+        let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<Yuv420p>::new(16, 8)
+            .with_rgb(&mut rgb)
+            .unwrap();
+        yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+
+    #[test]
+    fn hsv_only_allocates_scratch_and_produces_gray_hsv() {
+        // Neutral gray → H=0, S=0, V=~128. No RGB buffer provided.
+        let (yp, up, vp) = solid_yuv420p_frame(16, 8, 128, 128, 128);
+        let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut h = std::vec![0xFFu8; 16 * 8];
+        let mut s = std::vec![0xFFu8; 16 * 8];
+        let mut v = std::vec![0xFFu8; 16 * 8];
+        let mut sink = MixedSinker::<Yuv420p>::new(16, 8)
+            .with_hsv(&mut h, &mut s, &mut v)
+            .unwrap();
+        yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(h.iter().all(|&b| b == 0));
+        assert!(s.iter().all(|&b| b == 0));
+        assert!(v.iter().all(|&b| b.abs_diff(128) <= 1));
+    }
+
+    #[test]
+    fn mixed_all_three_outputs_populated() {
+        let (yp, up, vp) = solid_yuv420p_frame(16, 8, 200, 128, 128);
+        let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut luma = std::vec![0u8; 16 * 8];
+        let mut h = std::vec![0u8; 16 * 8];
+        let mut s = std::vec![0u8; 16 * 8];
+        let mut v = std::vec![0u8; 16 * 8];
+        let mut sink = MixedSinker::<Yuv420p>::new(16, 8)
+            .with_rgb(&mut rgb)
+            .unwrap()
+            .with_luma(&mut luma)
+            .unwrap()
+            .with_hsv(&mut h, &mut s, &mut v)
+
.unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + // Luma = Y plane verbatim. + assert!(luma.iter().all(|&y| y == 200)); + // RGB gray. + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(200) <= 1); + } + // HSV of gray. + assert!(h.iter().all(|&b| b == 0)); + assert!(s.iter().all(|&b| b == 0)); + assert!(v.iter().all(|&b| b.abs_diff(200) <= 1)); + } + + #[test] + fn rgb_with_hsv_uses_user_buffer_not_scratch() { + // When caller provides RGB, the scratch should remain empty (Vec len 0). + let (yp, up, vp) = solid_yuv420p_frame(16, 8, 100, 128, 128); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut h = std::vec![0u8; 16 * 8]; + let mut s = std::vec![0u8; 16 * 8]; + let mut v = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .unwrap() + .with_hsv(&mut h, &mut s, &mut v) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert_eq!( + sink.rgb_scratch.len(), + 0, + "scratch should stay unallocated when RGB buffer is provided" + ); + } + + #[test] + fn with_simd_false_matches_with_simd_true() { + // A/B test: same frame, one sinker forces scalar, the other uses + // SIMD. NEON is bit‑exact to scalar so outputs must match. 
+ let w = 32usize; + let h = 16usize; + let (yp, up, vp) = solid_yuv420p_frame(w as u32, h as u32, 180, 60, 200); + let src = Yuv420pFrame::new( &yp, &up, &vp, @@ -1483,728 +2070,1062 @@ mod tests { let mut rgb_simd = std::vec![0u8; w * h * 3]; let mut rgb_scalar = std::vec![0u8; w * h * 3]; - - let mut sink_simd = MixedSinker::::new(w, h) + + let mut sink_simd = MixedSinker::::new(w, h) + .with_rgb(&mut rgb_simd) + .unwrap(); + let mut sink_scalar = MixedSinker::::new(w, h) + .with_rgb(&mut rgb_scalar) + .unwrap() + .with_simd(false); + assert!(sink_simd.simd()); + assert!(!sink_scalar.simd()); + + yuv420p_to(&src, false, ColorMatrix::Bt709, &mut sink_simd).unwrap(); + yuv420p_to(&src, false, ColorMatrix::Bt709, &mut sink_scalar).unwrap(); + + assert_eq!(rgb_simd, rgb_scalar); + } + + #[test] + fn stride_padded_source_reads_correct_pixels() { + // 16×8 frame, Y stride 32 (padding), chroma stride 16. + let w = 16usize; + let h = 8usize; + let y_stride = 32usize; + let c_stride = 16usize; + let mut yp = std::vec![0xFFu8; y_stride * h]; // padding = 0xFF + let mut up = std::vec![0xFFu8; c_stride * h / 2]; + let mut vp = std::vec![0xFFu8; c_stride * h / 2]; + // Write actual pixel data in non-padding bytes. 
+ for row in 0..h { + for x in 0..w { + yp[row * y_stride + x] = 50; + } + } + for row in 0..h / 2 { + for x in 0..w / 2 { + up[row * c_stride + x] = 128; + vp[row * c_stride + x] = 128; + } + } + + let src = Yuv420pFrame::new( + &yp, + &up, + &vp, + w as u32, + h as u32, + y_stride as u32, + c_stride as u32, + c_stride as u32, + ); + + let mut luma = std::vec![0u8; w * h]; + let mut sink = MixedSinker::::new(w, h) + .with_luma(&mut luma) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!( + luma.iter().all(|&y| y == 50), + "padding bytes leaked into output" + ); + } + + // ---- NV12 --------------------------------------------------------------- + + fn solid_nv12_frame(width: u32, height: u32, y: u8, u: u8, v: u8) -> (Vec, Vec) { + let w = width as usize; + let h = height as usize; + let ch = h / 2; + // UV row payload = `width` bytes = `width/2` interleaved UV pairs. + let mut uv = std::vec![0u8; w * ch]; + for row in 0..ch { + for i in 0..w / 2 { + uv[row * w + i * 2] = u; + uv[row * w + i * 2 + 1] = v; + } + } + (std::vec![y; w * h], uv) + } + + #[test] + fn nv12_luma_only_copies_y_plane() { + let (yp, uvp) = solid_nv12_frame(16, 8, 42, 128, 128); + let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) + .unwrap(); + nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&y| y == 42)); + } + + #[test] + fn nv12_rgb_only_converts_gray_to_gray() { + let (yp, uvp) = solid_nv12_frame(16, 8, 128, 128, 128); + let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } + } + + #[test] + fn 
nv12_mixed_all_three_outputs_populated() { + let (yp, uvp) = solid_nv12_frame(16, 8, 200, 128, 128); + let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut luma = std::vec![0u8; 16 * 8]; + let mut h = std::vec![0u8; 16 * 8]; + let mut s = std::vec![0u8; 16 * 8]; + let mut v = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .unwrap() + .with_luma(&mut luma) + .unwrap() + .with_hsv(&mut h, &mut s, &mut v) + .unwrap(); + nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&y| y == 200)); + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(200) <= 1); + } + assert!(h.iter().all(|&b| b == 0)); + assert!(s.iter().all(|&b| b == 0)); + assert!(v.iter().all(|&b| b.abs_diff(200) <= 1)); + } + + #[test] + fn nv12_with_simd_false_matches_with_simd_true() { + // 32×16 pseudo-random frame so the SIMD path exercises its main + // loop and the scalar path processes the full width too. 
+ let w = 32usize; + let h = 16usize; + let yp: Vec = (0..w * h).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uvp: Vec = (0..w * h / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let src = Nv12Frame::new(&yp, &uvp, w as u32, h as u32, w as u32, w as u32); + + let mut rgb_simd = std::vec![0u8; w * h * 3]; + let mut rgb_scalar = std::vec![0u8; w * h * 3]; + let mut sink_simd = MixedSinker::::new(w, h) .with_rgb(&mut rgb_simd) .unwrap(); - let mut sink_scalar = MixedSinker::::new(w, h) + let mut sink_scalar = MixedSinker::::new(w, h) .with_rgb(&mut rgb_scalar) .unwrap() .with_simd(false); - assert!(sink_simd.simd()); - assert!(!sink_scalar.simd()); - - yuv420p_to(&src, false, ColorMatrix::Bt709, &mut sink_simd).unwrap(); - yuv420p_to(&src, false, ColorMatrix::Bt709, &mut sink_scalar).unwrap(); + nv12_to(&src, false, ColorMatrix::Bt709, &mut sink_simd).unwrap(); + nv12_to(&src, false, ColorMatrix::Bt709, &mut sink_scalar).unwrap(); assert_eq!(rgb_simd, rgb_scalar); } + // ---- preflight buffer-size errors ------------------------------------ + // + // Undersized RGB / luma / HSV buffers must be rejected at attachment + // time, not part-way through processing. Catching the mistake before + // any rows are written avoids partially-mutated caller buffers + // flagged by the adversarial review. With the fallible API these + // surface as `Err(MixedSinkerError::*BufferTooShort)` / `HsvPlaneTooShort`. + #[test] - fn stride_padded_source_reads_correct_pixels() { - // 16×8 frame, Y stride 32 (padding), chroma stride 16. - let w = 16usize; - let h = 8usize; - let y_stride = 32usize; - let c_stride = 16usize; - let mut yp = std::vec![0xFFu8; y_stride * h]; // padding = 0xFF - let mut up = std::vec![0xFFu8; c_stride * h / 2]; - let mut vp = std::vec![0xFFu8; c_stride * h / 2]; - // Write actual pixel data in non-padding bytes. 
- for row in 0..h { - for x in 0..w { - yp[row * y_stride + x] = 50; + fn attach_short_rgb_returns_err() { + let mut rgb = std::vec![0u8; 16 * 8 * 3 - 1]; // 1 byte short + let err = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .err() + .unwrap(); + assert_eq!( + err, + MixedSinkerError::RgbBufferTooShort { + expected: 16 * 8 * 3, + actual: 16 * 8 * 3 - 1, } - } - for row in 0..h / 2 { - for x in 0..w / 2 { - up[row * c_stride + x] = 128; - vp[row * c_stride + x] = 128; + ); + } + + #[test] + fn attach_short_luma_returns_err() { + let mut luma = std::vec![0u8; 16 * 8 - 1]; + let err = MixedSinker::::new(16, 8) + .with_luma(&mut luma) + .err() + .unwrap(); + assert_eq!( + err, + MixedSinkerError::LumaBufferTooShort { + expected: 16 * 8, + actual: 16 * 8 - 1, } - } + ); + } - let src = Yuv420pFrame::new( - &yp, - &up, - &vp, - w as u32, - h as u32, - y_stride as u32, - c_stride as u32, - c_stride as u32, + #[test] + fn attach_short_hsv_returns_err() { + let mut h = std::vec![0u8; 16 * 8]; + let mut s = std::vec![0u8; 16 * 8]; + let mut v = std::vec![0u8; 16 * 8 - 1]; // V plane short + let err = MixedSinker::::new(16, 8) + .with_hsv(&mut h, &mut s, &mut v) + .err() + .unwrap(); + assert_eq!( + err, + MixedSinkerError::HsvPlaneTooShort { + which: HsvPlane::V, + expected: 16 * 8, + actual: 16 * 8 - 1, + } ); + } - let mut luma = std::vec![0u8; w * h]; - let mut sink = MixedSinker::::new(w, h) + #[test] + fn taller_frame_returns_err_before_any_row_written() { + // Sink sized for 16×8, feed a 16×10 frame. `begin_frame` returns + // `Err(DimensionMismatch)` before row 0 — no partial writes. 
+ let (yp, up, vp) = solid_yuv420p_frame(16, 10, 42, 128, 128); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 10, 16, 8, 8); + + const SENTINEL: u8 = 0xEE; + let mut luma = std::vec![SENTINEL; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) .with_luma(&mut luma) .unwrap(); - yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + let err = yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink) + .err() + .unwrap(); + assert_eq!( + err, + MixedSinkerError::DimensionMismatch { + configured_w: 16, + configured_h: 8, + frame_w: 16, + frame_h: 10, + } + ); + assert!( + luma.iter().all(|&b| b == SENTINEL), + "no rows should have been written before the Err" + ); + } + + #[test] + fn shorter_frame_returns_err_before_any_row_written() { + // Sink sized 16×8, frame is 16×4. Without the `begin_frame` + // preflight, the walker would silently process 4 rows and leave + // rows 4..7 stale from the previous frame. Preflight returns + // `Err(DimensionMismatch)` with no side effects. + let (yp, up, vp) = solid_yuv420p_frame(16, 4, 42, 128, 128); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 4, 16, 8, 8); + + const SENTINEL: u8 = 0xEE; + let mut luma = std::vec![SENTINEL; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) + .unwrap(); + let err = yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink) + .err() + .unwrap(); + assert_eq!( + err, + MixedSinkerError::DimensionMismatch { + configured_w: 16, + configured_h: 8, + frame_w: 16, + frame_h: 4, + } + ); + assert!( + luma.iter().all(|&b| b == SENTINEL), + "no rows should have been written before the Err" + ); + } + + #[test] + fn nv12_width_mismatch_returns_err() { + let (yp, uvp) = solid_nv12_frame(16, 8, 42, 128, 128); + let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + let mut rgb = std::vec![0u8; 32 * 8 * 3]; + let mut sink = MixedSinker::::new(32, 8).with_rgb(&mut rgb).unwrap(); + let err = nv12_to(&src, true, ColorMatrix::Bt601, &mut sink) + .err() + .unwrap(); assert!( - 
luma.iter().all(|&y| y == 50), - "padding bytes leaked into output" + matches!( + err, + MixedSinkerError::DimensionMismatch { + configured_w: 32, + frame_w: 16, + .. + } + ), + "unexpected error variant: {err:?}" + ); + } + + #[test] + fn yuv420p_width_mismatch_returns_err() { + let (yp, up, vp) = solid_yuv420p_frame(16, 8, 42, 128, 128); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u8; 32 * 8 * 3]; + let mut sink = MixedSinker::::new(32, 8) + .with_rgb(&mut rgb) + .unwrap(); + let err = yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink) + .err() + .unwrap(); + assert!( + matches!( + err, + MixedSinkerError::DimensionMismatch { + configured_w: 32, + frame_w: 16, + .. + } + ), + "unexpected error variant: {err:?}" ); } - // ---- NV12 --------------------------------------------------------------- - - fn solid_nv12_frame(width: u32, height: u32, y: u8, u: u8, v: u8) -> (Vec, Vec) { - let w = width as usize; - let h = height as usize; - let ch = h / 2; - // UV row payload = `width` bytes = `width/2` interleaved UV pairs. 
- let mut uv = std::vec![0u8; w * ch]; - for row in 0..ch { - for i in 0..w / 2 { - uv[row * w + i * 2] = u; - uv[row * w + i * 2 + 1] = v; - } - } - (std::vec![y; w * h], uv) - } - #[test] - fn nv12_luma_only_copies_y_plane() { - let (yp, uvp) = solid_nv12_frame(16, 8, 42, 128, 128); - let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + fn nv12_shorter_frame_returns_err_before_any_row_written() { + let (yp, uvp) = solid_nv12_frame(16, 4, 42, 128, 128); + let src = Nv12Frame::new(&yp, &uvp, 16, 4, 16, 16); - let mut luma = std::vec![0u8; 16 * 8]; + const SENTINEL: u8 = 0xEE; + let mut luma = std::vec![SENTINEL; 16 * 8]; let mut sink = MixedSinker::::new(16, 8) .with_luma(&mut luma) .unwrap(); - nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - - assert!(luma.iter().all(|&y| y == 42)); + let err = nv12_to(&src, true, ColorMatrix::Bt601, &mut sink) + .err() + .unwrap(); + assert!(matches!(err, MixedSinkerError::DimensionMismatch { .. })); + assert!( + luma.iter().all(|&b| b == SENTINEL), + "no rows should have been written before the Err" + ); } + /// Sanity check that an Infallible sink (compile-time proof of + /// no-error) compiles and runs. Mirrors the trait-docs pattern. 
#[test] - fn nv12_rgb_only_converts_gray_to_gray() { - let (yp, uvp) = solid_nv12_frame(16, 8, 128, 128, 128); - let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); - - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); - nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + fn infallible_sink_compiles_and_runs() { + use core::convert::Infallible; - for px in rgb.chunks(3) { - assert!(px[0].abs_diff(128) <= 1); - assert_eq!(px[0], px[1]); - assert_eq!(px[1], px[2]); + struct RowCounter(usize); + impl PixelSink for RowCounter { + type Input<'a> = Yuv420pRow<'a>; + type Error = Infallible; + fn process(&mut self, _row: Yuv420pRow<'_>) -> Result<(), Infallible> { + self.0 += 1; + Ok(()) + } } + impl Yuv420pSink for RowCounter {} + + let (yp, up, vp) = solid_yuv420p_frame(16, 8, 128, 128, 128); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + let mut counter = RowCounter(0); + // `Result<(), Infallible>` — the compiler knows Err is + // uninhabited, so `.unwrap()` here is free and infallible. + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut counter).unwrap(); + assert_eq!(counter.0, 8); } - #[test] - fn nv12_mixed_all_three_outputs_populated() { - let (yp, uvp) = solid_nv12_frame(16, 8, 200, 128, 128); - let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + // ---- direct process() bypass paths ---------------------------------- + // + // The walker normally guarantees (a) begin_frame runs first and + // validates frame dimensions, (b) row.y()/u/v/uv slices have the + // right length, (c) `idx < height`. A direct `process` call can + // break any of these. The defense-in-depth checks in `process` + // must return a specific error variant, not panic — verified here + // by constructing rows manually and calling `process`. 
+ #[test] + fn yuv420p_process_rejects_short_y_slice() { let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut luma = std::vec![0u8; 16 * 8]; - let mut h = std::vec![0u8; 16 * 8]; - let mut s = std::vec![0u8; 16 * 8]; - let mut v = std::vec![0u8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_rgb(&mut rgb) - .unwrap() - .with_luma(&mut luma) - .unwrap() - .with_hsv(&mut h, &mut s, &mut v) - .unwrap(); - nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - - assert!(luma.iter().all(|&y| y == 200)); - for px in rgb.chunks(3) { - assert!(px[0].abs_diff(200) <= 1); - } - assert!(h.iter().all(|&b| b == 0)); - assert!(s.iter().all(|&b| b == 0)); - assert!(v.iter().all(|&b| b.abs_diff(200) <= 1)); - } - - #[test] - fn nv12_with_simd_false_matches_with_simd_true() { - // 32×16 pseudo-random frame so the SIMD path exercises its main - // loop and the scalar path processes the full width too. - let w = 32usize; - let h = 16usize; - let yp: Vec = (0..w * h).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); - let uvp: Vec = (0..w * h / 2) - .map(|i| ((i * 53 + 23) & 0xFF) as u8) - .collect(); - let src = Nv12Frame::new(&yp, &uvp, w as u32, h as u32, w as u32, w as u32); - - let mut rgb_simd = std::vec![0u8; w * h * 3]; - let mut rgb_scalar = std::vec![0u8; w * h * 3]; - let mut sink_simd = MixedSinker::::new(w, h) - .with_rgb(&mut rgb_simd) .unwrap(); - let mut sink_scalar = MixedSinker::::new(w, h) - .with_rgb(&mut rgb_scalar) - .unwrap() - .with_simd(false); - nv12_to(&src, false, ColorMatrix::Bt709, &mut sink_simd).unwrap(); - nv12_to(&src, false, ColorMatrix::Bt709, &mut sink_scalar).unwrap(); - - assert_eq!(rgb_simd, rgb_scalar); + // Build a row with a 15-byte Y slice (wrong — sink configured for 16). 
+ let y = [0u8; 15]; + let u = [128u8; 8]; + let v = [128u8; 8]; + let row = Yuv420pRow::new(&y, &u, &v, 0, ColorMatrix::Bt601, true); + let err = sink.process(row).err().unwrap(); + assert_eq!( + err, + MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y, + row: 0, + expected: 16, + actual: 15, + } + ); } - // ---- preflight buffer-size errors ------------------------------------ - // - // Undersized RGB / luma / HSV buffers must be rejected at attachment - // time, not part-way through processing. Catching the mistake before - // any rows are written avoids partially-mutated caller buffers - // flagged by the adversarial review. With the fallible API these - // surface as `Err(MixedSinkerError::*BufferTooShort)` / `HsvPlaneTooShort`. - #[test] - fn attach_short_rgb_returns_err() { - let mut rgb = std::vec![0u8; 16 * 8 * 3 - 1]; // 1 byte short - let err = MixedSinker::::new(16, 8) + fn yuv420p_process_rejects_short_u_half() { + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) .with_rgb(&mut rgb) - .err() .unwrap(); + let y = [0u8; 16]; + let u = [128u8; 7]; // expected 8 + let v = [128u8; 8]; + let row = Yuv420pRow::new(&y, &u, &v, 0, ColorMatrix::Bt601, true); + let err = sink.process(row).err().unwrap(); assert_eq!( err, - MixedSinkerError::RgbBufferTooShort { - expected: 16 * 8 * 3, - actual: 16 * 8 * 3 - 1, + MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf, + row: 0, + expected: 8, + actual: 7, } ); } #[test] - fn attach_short_luma_returns_err() { - let mut luma = std::vec![0u8; 16 * 8 - 1]; - let err = MixedSinker::::new(16, 8) - .with_luma(&mut luma) - .err() + fn yuv420p_process_rejects_out_of_range_row_idx() { + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) .unwrap(); + let y = [0u8; 16]; + let u = [128u8; 8]; + let v = [128u8; 8]; + // idx = 8 exceeds configured height 8 — would otherwise panic on + // `rgb[idx * w * 3 ..]` indexing. 
+ let row = Yuv420pRow::new(&y, &u, &v, 8, ColorMatrix::Bt601, true); + let err = sink.process(row).err().unwrap(); assert_eq!( err, - MixedSinkerError::LumaBufferTooShort { - expected: 16 * 8, - actual: 16 * 8 - 1, + MixedSinkerError::RowIndexOutOfRange { + row: 8, + configured_height: 8, } ); } #[test] - fn attach_short_hsv_returns_err() { - let mut h = std::vec![0u8; 16 * 8]; - let mut s = std::vec![0u8; 16 * 8]; - let mut v = std::vec![0u8; 16 * 8 - 1]; // V plane short - let err = MixedSinker::::new(16, 8) - .with_hsv(&mut h, &mut s, &mut v) - .err() - .unwrap(); + fn yuv420p_odd_width_sink_returns_err_at_begin_frame() { + // A sink configured with an odd width would later panic inside + // `yuv_420_to_rgb_row` (which asserts `width & 1 == 0`). The + // fallible API surfaces this as `OddWidth` at frame start — no + // rows are processed, no panic. Width=15, height=8 — matching + // frame so `DimensionMismatch` can't fire first. + let w = 15usize; + let h = 8usize; + let y = std::vec![0u8; w * h]; + let u = std::vec![128u8; ((w + 1) / 2) * h / 2 + 8]; // any valid size + let v = std::vec![128u8; ((w + 1) / 2) * h / 2 + 8]; + // Build the Frame separately — Yuv420pFrame rejects odd width + // too, so we can't construct a 15-wide frame. That's fine: we + // only need to hit `begin_frame`, which takes (width, height) + // parameters directly. Call it manually. + let mut rgb = std::vec![0u8; 16 * 8 * 3]; // Dummy; not touched. + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb) + .unwrap(); + let err = sink.begin_frame(w as u32, h as u32).err().unwrap(); + assert_eq!(err, MixedSinkerError::OddWidth { width: 15 }); + // Silence unused-vec warnings — these would have been the plane data. + let _ = (y, u, v); + } + + #[test] + fn yuv420p_odd_width_sink_returns_err_at_direct_process() { + // Direct `process` caller bypassing `begin_frame`. Process must + // still reject odd width before calling the kernel. 
+ let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(15, 8) + .with_rgb(&mut rgb) + .unwrap(); + let y = [0u8; 15]; + let u = [128u8; 7]; // ceil(15/2) = 8; 7 triggers the width check first + let v = [128u8; 7]; + let row = Yuv420pRow::new(&y, &u, &v, 0, ColorMatrix::Bt601, true); + let err = sink.process(row).err().unwrap(); + assert_eq!(err, MixedSinkerError::OddWidth { width: 15 }); + } + + #[test] + fn nv12_odd_width_sink_returns_err_at_begin_frame() { + let w = 15usize; + let h = 8usize; + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(w, h).with_rgb(&mut rgb).unwrap(); + let err = sink.begin_frame(w as u32, h as u32).err().unwrap(); + assert_eq!(err, MixedSinkerError::OddWidth { width: 15 }); + } + + #[test] + fn nv12_odd_width_sink_returns_err_at_direct_process() { + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(15, 8).with_rgb(&mut rgb).unwrap(); + let y = [0u8; 15]; + let uv = [128u8; 15]; + let row = Nv12Row::new(&y, &uv, 0, ColorMatrix::Bt601, true); + let err = sink.process(row).err().unwrap(); + assert_eq!(err, MixedSinkerError::OddWidth { width: 15 }); + } + + #[test] + fn nv12_process_rejects_short_uv_slice() { + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + let y = [0u8; 16]; + let uv = [128u8; 15]; // expected 16 + let row = Nv12Row::new(&y, &uv, 0, ColorMatrix::Bt601, true); + let err = sink.process(row).err().unwrap(); assert_eq!( err, - MixedSinkerError::HsvPlaneTooShort { - which: HsvPlane::V, - expected: 16 * 8, - actual: 16 * 8 - 1, + MixedSinkerError::RowShapeMismatch { + which: RowSlice::UvHalf, + row: 0, + expected: 16, + actual: 15, } ); } #[test] - fn taller_frame_returns_err_before_any_row_written() { - // Sink sized for 16×8, feed a 16×10 frame. `begin_frame` returns - // `Err(DimensionMismatch)` before row 0 — no partial writes. 
- let (yp, up, vp) = solid_yuv420p_frame(16, 10, 42, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 10, 16, 8, 8); - - const SENTINEL: u8 = 0xEE; - let mut luma = std::vec![SENTINEL; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) - .with_luma(&mut luma) - .unwrap(); - let err = yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink) - .err() - .unwrap(); + fn nv12_process_rejects_out_of_range_row_idx() { + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + let y = [0u8; 16]; + let uv = [128u8; 16]; + let row = Nv12Row::new(&y, &uv, 8, ColorMatrix::Bt601, true); + let err = sink.process(row).err().unwrap(); assert_eq!( err, - MixedSinkerError::DimensionMismatch { - configured_w: 16, - configured_h: 8, - frame_w: 16, - frame_h: 10, + MixedSinkerError::RowIndexOutOfRange { + row: 8, + configured_height: 8, } ); - assert!( - luma.iter().all(|&b| b == SENTINEL), - "no rows should have been written before the Err" - ); } #[test] - fn shorter_frame_returns_err_before_any_row_written() { - // Sink sized 16×8, frame is 16×4. Without the `begin_frame` - // preflight, the walker would silently process 4 rows and leave - // rows 4..7 stale from the previous frame. Preflight returns - // `Err(DimensionMismatch)` with no side effects. - let (yp, up, vp) = solid_yuv420p_frame(16, 4, 42, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 4, 16, 8, 8); + fn nv12_matches_yuv420p_mixed_sinker() { + // Cross-format guarantee: an NV12 frame built from the same U / V + // bytes as a Yuv420p frame produces byte-identical RGB output via + // MixedSinker on both families. 
+ let w = 32u32; + let h = 16u32; + let ws = w as usize; + let hs = h as usize; + let yp: Vec = (0..ws * hs).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let up: Vec = (0..(ws / 2) * (hs / 2)) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let vp: Vec = (0..(ws / 2) * (hs / 2)) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + // Build NV12 UV plane: chroma row r, column c → uv[r * w + 2*c] = U, + // uv[r * w + 2*c + 1] = V, where U / V come from the same (r, c) + // sample of the planar fixture above. + let mut uvp: Vec = std::vec![0u8; ws * (hs / 2)]; + for r in 0..hs / 2 { + for c in 0..ws / 2 { + uvp[r * ws + 2 * c] = up[r * (ws / 2) + c]; + uvp[r * ws + 2 * c + 1] = vp[r * (ws / 2) + c]; + } + } - const SENTINEL: u8 = 0xEE; - let mut luma = std::vec![SENTINEL; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) - .with_luma(&mut luma) + let yuv420p_src = Yuv420pFrame::new(&yp, &up, &vp, w, h, w, w / 2, w / 2); + let nv12_src = Nv12Frame::new(&yp, &uvp, w, h, w, w); + + let mut rgb_yuv420p = std::vec![0u8; ws * hs * 3]; + let mut rgb_nv12 = std::vec![0u8; ws * hs * 3]; + let mut s_yuv = MixedSinker::::new(ws, hs) + .with_rgb(&mut rgb_yuv420p) .unwrap(); - let err = yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink) - .err() + let mut s_nv = MixedSinker::::new(ws, hs) + .with_rgb(&mut rgb_nv12) .unwrap(); - assert_eq!( - err, - MixedSinkerError::DimensionMismatch { - configured_w: 16, - configured_h: 8, - frame_w: 16, - frame_h: 4, + yuv420p_to(&yuv420p_src, false, ColorMatrix::Bt709, &mut s_yuv).unwrap(); + nv12_to(&nv12_src, false, ColorMatrix::Bt709, &mut s_nv).unwrap(); + + assert_eq!(rgb_yuv420p, rgb_nv12); + } + + // ---- NV21 MixedSinker --------------------------------------------------- + + fn solid_nv21_frame(width: u32, height: u32, y: u8, u: u8, v: u8) -> (Vec, Vec) { + let w = width as usize; + let h = height as usize; + let ch = h / 2; + // VU row payload = `width` bytes = `width/2` interleaved V/U pairs + // (V first). 
+ let mut vu = std::vec![0u8; w * ch]; + for row in 0..ch { + for i in 0..w / 2 { + vu[row * w + i * 2] = v; + vu[row * w + i * 2 + 1] = u; } - ); - assert!( - luma.iter().all(|&b| b == SENTINEL), - "no rows should have been written before the Err" - ); + } + (std::vec![y; w * h], vu) } #[test] - fn nv12_width_mismatch_returns_err() { - let (yp, uvp) = solid_nv12_frame(16, 8, 42, 128, 128); - let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + fn nv21_luma_only_copies_y_plane() { + let (yp, vup) = solid_nv21_frame(16, 8, 42, 128, 128); + let src = Nv21Frame::new(&yp, &vup, 16, 8, 16, 16); - let mut rgb = std::vec![0u8; 32 * 8 * 3]; - let mut sink = MixedSinker::::new(32, 8).with_rgb(&mut rgb).unwrap(); - let err = nv12_to(&src, true, ColorMatrix::Bt601, &mut sink) - .err() + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) .unwrap(); - assert!( - matches!( - err, - MixedSinkerError::DimensionMismatch { - configured_w: 32, - frame_w: 16, - .. - } - ), - "unexpected error variant: {err:?}" - ); + nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&y| y == 42)); } #[test] - fn yuv420p_width_mismatch_returns_err() { - let (yp, up, vp) = solid_yuv420p_frame(16, 8, 42, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + fn nv21_rgb_only_converts_gray_to_gray() { + let (yp, vup) = solid_nv21_frame(16, 8, 128, 128, 128); + let src = Nv21Frame::new(&yp, &vup, 16, 8, 16, 16); - let mut rgb = std::vec![0u8; 32 * 8 * 3]; - let mut sink = MixedSinker::::new(32, 8) - .with_rgb(&mut rgb) - .unwrap(); - let err = yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink) - .err() - .unwrap(); - assert!( - matches!( - err, - MixedSinkerError::DimensionMismatch { - configured_w: 32, - frame_w: 16, - .. 
- } - ), - "unexpected error variant: {err:?}" - ); + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } } #[test] - fn nv12_shorter_frame_returns_err_before_any_row_written() { - let (yp, uvp) = solid_nv12_frame(16, 4, 42, 128, 128); - let src = Nv12Frame::new(&yp, &uvp, 16, 4, 16, 16); + fn nv21_mixed_all_three_outputs_populated() { + let (yp, vup) = solid_nv21_frame(16, 8, 200, 128, 128); + let src = Nv21Frame::new(&yp, &vup, 16, 8, 16, 16); - const SENTINEL: u8 = 0xEE; - let mut luma = std::vec![SENTINEL; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut luma = std::vec![0u8; 16 * 8]; + let mut h = std::vec![0u8; 16 * 8]; + let mut s = std::vec![0u8; 16 * 8]; + let mut v = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .unwrap() .with_luma(&mut luma) + .unwrap() + .with_hsv(&mut h, &mut s, &mut v) .unwrap(); - let err = nv12_to(&src, true, ColorMatrix::Bt601, &mut sink) - .err() - .unwrap(); - assert!(matches!(err, MixedSinkerError::DimensionMismatch { .. })); - assert!( - luma.iter().all(|&b| b == SENTINEL), - "no rows should have been written before the Err" - ); + nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&y| y == 200)); + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(200) <= 1); + } + assert!(h.iter().all(|&b| b == 0)); + assert!(s.iter().all(|&b| b == 0)); + assert!(v.iter().all(|&b| b.abs_diff(200) <= 1)); } - /// Sanity check that an Infallible sink (compile-time proof of - /// no-error) compiles and runs. Mirrors the trait-docs pattern. 
#[test] - fn infallible_sink_compiles_and_runs() { - use core::convert::Infallible; + fn nv21_matches_nv12_mixed_sinker_with_swapped_chroma() { + // Cross-format guarantee: an NV21 frame built from the same U / V + // bytes as an NV12 frame (just byte-swapped in the chroma plane) + // must produce identical RGB output via MixedSinker. + let w = 32u32; + let h = 16u32; + let ws = w as usize; + let hs = h as usize; - struct RowCounter(usize); - impl PixelSink for RowCounter { - type Input<'a> = Yuv420pRow<'a>; - type Error = Infallible; - fn process(&mut self, _row: Yuv420pRow<'_>) -> Result<(), Infallible> { - self.0 += 1; - Ok(()) + let yp: Vec = (0..ws * hs).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let mut uvp: Vec = std::vec![0u8; ws * (hs / 2)]; + for r in 0..hs / 2 { + for c in 0..ws / 2 { + uvp[r * ws + 2 * c] = ((c + r * 53) & 0xFF) as u8; // U + uvp[r * ws + 2 * c + 1] = ((c + r * 71) & 0xFF) as u8; // V + } + } + // Byte-swap each chroma pair to get the VU-ordered stream. + let mut vup: Vec = uvp.clone(); + for r in 0..hs / 2 { + for c in 0..ws / 2 { + vup[r * ws + 2 * c] = uvp[r * ws + 2 * c + 1]; + vup[r * ws + 2 * c + 1] = uvp[r * ws + 2 * c]; } } - impl Yuv420pSink for RowCounter {} - let (yp, up, vp) = solid_yuv420p_frame(16, 8, 128, 128, 128); - let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); - let mut counter = RowCounter(0); - // `Result<(), Infallible>` — the compiler knows Err is - // uninhabited, so `.unwrap()` here is free and infallible. 
- yuv420p_to(&src, true, ColorMatrix::Bt601, &mut counter).unwrap(); - assert_eq!(counter.0, 8); + let nv12_src = Nv12Frame::new(&yp, &uvp, w, h, w, w); + let nv21_src = Nv21Frame::new(&yp, &vup, w, h, w, w); + + let mut rgb_nv12 = std::vec![0u8; ws * hs * 3]; + let mut rgb_nv21 = std::vec![0u8; ws * hs * 3]; + let mut s_nv12 = MixedSinker::::new(ws, hs) + .with_rgb(&mut rgb_nv12) + .unwrap(); + let mut s_nv21 = MixedSinker::::new(ws, hs) + .with_rgb(&mut rgb_nv21) + .unwrap(); + nv12_to(&nv12_src, false, ColorMatrix::Bt709, &mut s_nv12).unwrap(); + nv21_to(&nv21_src, false, ColorMatrix::Bt709, &mut s_nv21).unwrap(); + + assert_eq!(rgb_nv12, rgb_nv21); } - // ---- direct process() bypass paths ---------------------------------- - // - // The walker normally guarantees (a) begin_frame runs first and - // validates frame dimensions, (b) row.y()/u/v/uv slices have the - // right length, (c) `idx < height`. A direct `process` call can - // break any of these. The defense-in-depth checks in `process` - // must return a specific error variant, not panic — verified here - // by constructing rows manually and calling `process`. + // ---- Yuv420p10 -------------------------------------------------------- - #[test] - fn yuv420p_process_rejects_short_y_slice() { - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb) - .unwrap(); - // Build a row with a 15-byte Y slice (wrong — sink configured for 16). 
- let y = [0u8; 15]; - let u = [128u8; 8]; - let v = [128u8; 8]; - let row = Yuv420pRow::new(&y, &u, &v, 0, ColorMatrix::Bt601, true); - let err = sink.process(row).err().unwrap(); - assert_eq!( - err, - MixedSinkerError::RowShapeMismatch { - which: RowSlice::Y, - row: 0, - expected: 16, - actual: 15, - } - ); + fn solid_yuv420p10_frame( + width: u32, + height: u32, + y: u16, + u: u16, + v: u16, + ) -> (Vec, Vec, Vec) { + let w = width as usize; + let h = height as usize; + let cw = w / 2; + let ch = h / 2; + ( + std::vec![y; w * h], + std::vec![u; cw * ch], + std::vec![v; cw * ch], + ) } #[test] - fn yuv420p_process_rejects_short_u_half() { + fn yuv420p10_rgb_u8_only_gray_is_gray() { + // 10-bit mid-gray: Y=512, UV=512 → 8-bit RGB ≈ 128 on every channel. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_rgb(&mut rgb) .unwrap(); - let y = [0u8; 16]; - let u = [128u8; 7]; // expected 8 - let v = [128u8; 8]; - let row = Yuv420pRow::new(&y, &u, &v, 0, ColorMatrix::Bt601, true); - let err = sink.process(row).err().unwrap(); - assert_eq!( - err, - MixedSinkerError::RowShapeMismatch { - which: RowSlice::UHalf, - row: 0, - expected: 8, - actual: 7, - } - ); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } } #[test] - fn yuv420p_process_rejects_out_of_range_row_idx() { - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb) + fn yuv420p10_rgb_u16_only_native_depth_gray() { + // Same mid-gray frame → u16 RGB output in native 10-bit depth, so + // each channel should be ≈ 512 (the 10-bit mid). 
+ let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) .unwrap(); - let y = [0u8; 16]; - let u = [128u8; 8]; - let v = [128u8; 8]; - // idx = 8 exceeds configured height 8 — would otherwise panic on - // `rgb[idx * w * 3 ..]` indexing. - let row = Yuv420pRow::new(&y, &u, &v, 8, ColorMatrix::Bt601, true); - let err = sink.process(row).err().unwrap(); - assert_eq!( - err, - MixedSinkerError::RowIndexOutOfRange { - row: 8, - configured_height: 8, - } - ); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(512) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + // Upper 6 bits of each u16 must be zero — 10-bit convention. + assert!(px[0] <= 1023); + } } #[test] - fn yuv420p_odd_width_sink_returns_err_at_begin_frame() { - // A sink configured with an odd width would later panic inside - // `yuv_420_to_rgb_row` (which asserts `width & 1 == 0`). The - // fallible API surfaces this as `OddWidth` at frame start — no - // rows are processed, no panic. Width=15, height=8 — matching - // frame so `DimensionMismatch` can't fire first. - let w = 15usize; - let h = 8usize; - let y = std::vec![0u8; w * h]; - let u = std::vec![128u8; ((w + 1) / 2) * h / 2 + 8]; // any valid size - let v = std::vec![128u8; ((w + 1) / 2) * h / 2 + 8]; - // Build the Frame separately — Yuv420pFrame rejects odd width - // too, so we can't construct a 15-wide frame. That's fine: we - // only need to hit `begin_frame`, which takes (width, height) - // parameters directly. Call it manually. - let mut rgb = std::vec![0u8; 16 * 8 * 3]; // Dummy; not touched. - let mut sink = MixedSinker::::new(w, h) - .with_rgb(&mut rgb) + fn yuv420p10_rgb_u8_and_u16_both_populated() { + // 10-bit full-range white: Y=1023, UV=512. 
Both buffers should + // fill with their respective "white" values (255 for u8, 1023 for + // u16) in the same call. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 1023, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) .unwrap(); - let err = sink.begin_frame(w as u32, h as u32).err().unwrap(); - assert_eq!(err, MixedSinkerError::OddWidth { width: 15 }); - // Silence unused-vec warnings — these would have been the plane data. - let _ = (y, u, v); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(rgb_u8.iter().all(|&c| c == 255)); + assert!(rgb_u16.iter().all(|&c| c == 1023)); } #[test] - fn yuv420p_odd_width_sink_returns_err_at_direct_process() { - // Direct `process` caller bypassing `begin_frame`. Process must - // still reject odd width before calling the kernel. - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(15, 8) - .with_rgb(&mut rgb) + fn yuv420p10_luma_downshifts_to_8bit() { + // Y=512 at 10 bits → 512 >> 2 = 128 at 8 bits. 
+ let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) .unwrap(); - let y = [0u8; 15]; - let u = [128u8; 7]; // ceil(15/2) = 8; 7 triggers the width check first - let v = [128u8; 7]; - let row = Yuv420pRow::new(&y, &u, &v, 0, ColorMatrix::Bt601, true); - let err = sink.process(row).err().unwrap(); - assert_eq!(err, MixedSinkerError::OddWidth { width: 15 }); - } + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - #[test] - fn nv12_odd_width_sink_returns_err_at_begin_frame() { - let w = 15usize; - let h = 8usize; - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(w, h).with_rgb(&mut rgb).unwrap(); - let err = sink.begin_frame(w as u32, h as u32).err().unwrap(); - assert_eq!(err, MixedSinkerError::OddWidth { width: 15 }); + assert!(luma.iter().all(|&l| l == 128)); } #[test] - fn nv12_odd_width_sink_returns_err_at_direct_process() { - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(15, 8).with_rgb(&mut rgb).unwrap(); - let y = [0u8; 15]; - let uv = [128u8; 15]; - let row = Nv12Row::new(&y, &uv, 0, ColorMatrix::Bt601, true); - let err = sink.process(row).err().unwrap(); - assert_eq!(err, MixedSinkerError::OddWidth { width: 15 }); - } + fn yuv420p10_hsv_from_gray_is_zero_hue_zero_sat() { + // HSV derived from the internal u8 RGB scratch: neutral gray → + // H=0, S=0, V≈128. Exercises the "HSV without RGB" scratch path + // on the 10-bit source. 
+ let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); - #[test] - fn nv12_process_rejects_short_uv_slice() { - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); - let y = [0u8; 16]; - let uv = [128u8; 15]; // expected 16 - let row = Nv12Row::new(&y, &uv, 0, ColorMatrix::Bt601, true); - let err = sink.process(row).err().unwrap(); - assert_eq!( - err, - MixedSinkerError::RowShapeMismatch { - which: RowSlice::UvHalf, - row: 0, - expected: 16, - actual: 15, - } - ); + let mut h = std::vec![0xFFu8; 16 * 8]; + let mut s = std::vec![0xFFu8; 16 * 8]; + let mut v = std::vec![0xFFu8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_hsv(&mut h, &mut s, &mut v) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(h.iter().all(|&b| b == 0)); + assert!(s.iter().all(|&b| b == 0)); + assert!(v.iter().all(|&b| b.abs_diff(128) <= 1)); } #[test] - fn nv12_process_rejects_out_of_range_row_idx() { - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); - let y = [0u8; 16]; - let uv = [128u8; 16]; - let row = Nv12Row::new(&y, &uv, 8, ColorMatrix::Bt601, true); - let err = sink.process(row).err().unwrap(); - assert_eq!( - err, - MixedSinkerError::RowIndexOutOfRange { - row: 8, - configured_height: 8, - } - ); + fn yuv420p10_rgb_u16_too_short_returns_err() { + let mut rgb = std::vec![0u16; 10]; // Way too small. + let err = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .err() + .unwrap(); + assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { .. })); } #[test] - fn nv12_matches_yuv420p_mixed_sinker() { - // Cross-format guarantee: an NV12 frame built from the same U / V - // bytes as a Yuv420p frame produces byte-identical RGB output via - // MixedSinker on both families. 
- let w = 32u32; - let h = 16u32; - let ws = w as usize; - let hs = h as usize; - let yp: Vec = (0..ws * hs).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); - let up: Vec = (0..(ws / 2) * (hs / 2)) - .map(|i| ((i * 53 + 23) & 0xFF) as u8) - .collect(); - let vp: Vec = (0..(ws / 2) * (hs / 2)) - .map(|i| ((i * 71 + 91) & 0xFF) as u8) - .collect(); - // Build NV12 UV plane: chroma row r, column c → uv[r * w + 2*c] = U, - // uv[r * w + 2*c + 1] = V, where U / V come from the same (r, c) - // sample of the planar fixture above. - let mut uvp: Vec = std::vec![0u8; ws * (hs / 2)]; - for r in 0..hs / 2 { - for c in 0..ws / 2 { - uvp[r * ws + 2 * c] = up[r * (ws / 2) + c]; - uvp[r * ws + 2 * c + 1] = vp[r * (ws / 2) + c]; - } - } - - let yuv420p_src = Yuv420pFrame::new(&yp, &up, &vp, w, h, w, w / 2, w / 2); - let nv12_src = Nv12Frame::new(&yp, &uvp, w, h, w, w); + fn yuv420p10_with_simd_false_matches_with_simd_true() { + // The SIMD toggle exercises scalar-vs-SIMD dispatch. Both paths + // must produce byte-identical results on both outputs. 
+ let (yp, up, vp) = solid_yuv420p10_frame(64, 16, 600, 400, 700); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); - let mut rgb_yuv420p = std::vec![0u8; ws * hs * 3]; - let mut rgb_nv12 = std::vec![0u8; ws * hs * 3]; - let mut s_yuv = MixedSinker::::new(ws, hs) - .with_rgb(&mut rgb_yuv420p) + let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; + let mut s_scalar = MixedSinker::::new(64, 16) + .with_simd(false) + .with_rgb(&mut rgb_scalar) + .unwrap() + .with_rgb_u16(&mut rgb_u16_scalar) .unwrap(); - let mut s_nv = MixedSinker::::new(ws, hs) - .with_rgb(&mut rgb_nv12) + yuv420p10_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + + let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; + let mut s_simd = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_simd) + .unwrap() + .with_rgb_u16(&mut rgb_u16_simd) .unwrap(); - yuv420p_to(&yuv420p_src, false, ColorMatrix::Bt709, &mut s_yuv).unwrap(); - nv12_to(&nv12_src, false, ColorMatrix::Bt709, &mut s_nv).unwrap(); + yuv420p10_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); - assert_eq!(rgb_yuv420p, rgb_nv12); + assert_eq!(rgb_scalar, rgb_simd); + assert_eq!(rgb_u16_scalar, rgb_u16_simd); } - // ---- NV21 MixedSinker --------------------------------------------------- + // ---- P010 -------------------------------------------------------------- + // + // Semi-planar 10-bit, high-bit-packed (samples in high 10 of each + // u16). Mirrors the Yuv420p10 test shape but with UV interleaved. - fn solid_nv21_frame(width: u32, height: u32, y: u8, u: u8, v: u8) -> (Vec, Vec) { + fn solid_p010_frame( + width: u32, + height: u32, + y_10bit: u16, + u_10bit: u16, + v_10bit: u16, + ) -> (Vec, Vec) { let w = width as usize; let h = height as usize; + let cw = w / 2; let ch = h / 2; - // VU row payload = `width` bytes = `width/2` interleaved V/U pairs - // (V first). 
- let mut vu = std::vec![0u8; w * ch]; - for row in 0..ch { - for i in 0..w / 2 { - vu[row * w + i * 2] = v; - vu[row * w + i * 2 + 1] = u; - } - } - (std::vec![y; w * h], vu) + // Shift into the high 10 bits (P010 packing). + let y = std::vec![y_10bit << 6; w * h]; + let uv: Vec = (0..cw * ch) + .flat_map(|_| [u_10bit << 6, v_10bit << 6]) + .collect(); + (y, uv) } #[test] - fn nv21_luma_only_copies_y_plane() { - let (yp, vup) = solid_nv21_frame(16, 8, 42, 128, 128); - let src = Nv21Frame::new(&yp, &vup, 16, 8, 16, 16); + fn p010_rgb_u8_only_gray_is_gray() { + // 10-bit mid-gray Y=512, UV=512 → ~128 u8 RGB across the frame. + let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512); + let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16); - let mut luma = std::vec![0u8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) - .with_luma(&mut luma) - .unwrap(); - nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - assert!(luma.iter().all(|&y| y == 42)); + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } } #[test] - fn nv21_rgb_only_converts_gray_to_gray() { - let (yp, vup) = solid_nv21_frame(16, 8, 128, 128, 128); - let src = Nv21Frame::new(&yp, &vup, 16, 8, 16, 16); + fn p010_rgb_u16_only_native_depth_gray() { + // Output u16 is yuv420p10le-packed (10-bit in low 10) even though + // the input is P010-packed. 
+ let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512); + let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16); - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); - nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + let mut rgb = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .unwrap(); + p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); for px in rgb.chunks(3) { - assert!(px[0].abs_diff(128) <= 1); + assert!(px[0].abs_diff(512) <= 1, "got {px:?}"); assert_eq!(px[0], px[1]); assert_eq!(px[1], px[2]); + assert!( + px[0] <= 1023, + "output must stay within 10-bit low-packed range" + ); } } #[test] - fn nv21_mixed_all_three_outputs_populated() { - let (yp, vup) = solid_nv21_frame(16, 8, 200, 128, 128); - let src = Nv21Frame::new(&yp, &vup, 16, 8, 16, 16); + fn p010_rgb_u8_and_u16_both_populated() { + // 10-bit full-range white: Y=1023, UV=512. Both buffers fill in + // one call. + let (yp, uvp) = solid_p010_frame(16, 8, 1023, 512, 512); + let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16); - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut luma = std::vec![0u8; 16 * 8]; - let mut h = std::vec![0u8; 16 * 8]; - let mut s = std::vec![0u8; 16 * 8]; - let mut v = std::vec![0u8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb) + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(rgb_u8.iter().all(|&c| c == 255)); + assert!(rgb_u16.iter().all(|&c| c == 1023)); + } + + #[test] + fn p010_luma_downshifts_to_8bit() { + // Y=512 at 10 bits, P010-packed (0x8000). After >> 8, the 8-bit + // luma is 0x80 = 128. 
+ let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512); + let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) .with_luma(&mut luma) - .unwrap() - .with_hsv(&mut h, &mut s, &mut v) .unwrap(); - nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - assert!(luma.iter().all(|&y| y == 200)); - for px in rgb.chunks(3) { - assert!(px[0].abs_diff(200) <= 1); - } - assert!(h.iter().all(|&b| b == 0)); - assert!(s.iter().all(|&b| b == 0)); - assert!(v.iter().all(|&b| b.abs_diff(200) <= 1)); + assert!(luma.iter().all(|&l| l == 128)); } #[test] - fn nv21_matches_nv12_mixed_sinker_with_swapped_chroma() { - // Cross-format guarantee: an NV21 frame built from the same U / V - // bytes as an NV12 frame (just byte-swapped in the chroma plane) - // must produce identical RGB output via MixedSinker. - let w = 32u32; - let h = 16u32; - let ws = w as usize; - let hs = h as usize; + fn p010_matches_yuv420p10_mixed_sinker_with_shifted_samples() { + // Logical equivalence: same samples fed through the two formats + // (low-packed as yuv420p10, high-packed as P010) must produce + // byte-identical u8 RGB. + let w = 16u32; + let h = 8u32; + let y = 600u16; + let u = 400u16; + let v = 700u16; - let yp: Vec = (0..ws * hs).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); - let mut uvp: Vec = std::vec![0u8; ws * (hs / 2)]; - for r in 0..hs / 2 { - for c in 0..ws / 2 { - uvp[r * ws + 2 * c] = ((c + r * 53) & 0xFF) as u8; // U - uvp[r * ws + 2 * c + 1] = ((c + r * 71) & 0xFF) as u8; // V - } - } - // Byte-swap each chroma pair to get the VU-ordered stream. 
- let mut vup: Vec = uvp.clone(); - for r in 0..hs / 2 { - for c in 0..ws / 2 { - vup[r * ws + 2 * c] = uvp[r * ws + 2 * c + 1]; - vup[r * ws + 2 * c + 1] = uvp[r * ws + 2 * c]; - } - } + let (yp_p10, up_p10, vp_p10) = solid_yuv420p10_frame(w, h, y, u, v); + let src_p10 = Yuv420p10Frame::new(&yp_p10, &up_p10, &vp_p10, w, h, w, w / 2, w / 2); + + let (yp_p010, uvp_p010) = solid_p010_frame(w, h, y, u, v); + let src_p010 = P010Frame::new(&yp_p010, &uvp_p010, w, h, w, w); + + let mut rgb_yuv = std::vec![0u8; (w * h * 3) as usize]; + let mut rgb_p010 = std::vec![0u8; (w * h * 3) as usize]; + let mut s_yuv = MixedSinker::::new(w as usize, h as usize) + .with_rgb(&mut rgb_yuv) + .unwrap(); + let mut s_p010 = MixedSinker::::new(w as usize, h as usize) + .with_rgb(&mut rgb_p010) + .unwrap(); + yuv420p10_to(&src_p10, true, ColorMatrix::Bt709, &mut s_yuv).unwrap(); + p010_to(&src_p010, true, ColorMatrix::Bt709, &mut s_p010).unwrap(); + assert_eq!(rgb_yuv, rgb_p010); + } + + #[test] + fn p010_rgb_u16_too_short_returns_err() { + let mut rgb = std::vec![0u16; 10]; + let err = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .err() + .unwrap(); + assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { .. })); + } - let nv12_src = Nv12Frame::new(&yp, &uvp, w, h, w, w); - let nv21_src = Nv21Frame::new(&yp, &vup, w, h, w, w); + #[test] + fn p010_with_simd_false_matches_with_simd_true() { + // Stubs delegate to scalar so simd=true and simd=false produce + // byte-identical output for now. Real SIMD backends will replace + // the stubs — equivalence is preserved by design. 
+ let (yp, uvp) = solid_p010_frame(64, 16, 600, 400, 700); + let src = P010Frame::new(&yp, &uvp, 64, 16, 64, 64); - let mut rgb_nv12 = std::vec![0u8; ws * hs * 3]; - let mut rgb_nv21 = std::vec![0u8; ws * hs * 3]; - let mut s_nv12 = MixedSinker::::new(ws, hs) - .with_rgb(&mut rgb_nv12) + let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; + let mut s_scalar = MixedSinker::::new(64, 16) + .with_simd(false) + .with_rgb(&mut rgb_scalar) + .unwrap() + .with_rgb_u16(&mut rgb_u16_scalar) .unwrap(); - let mut s_nv21 = MixedSinker::::new(ws, hs) - .with_rgb(&mut rgb_nv21) + p010_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + + let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; + let mut s_simd = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_simd) + .unwrap() + .with_rgb_u16(&mut rgb_u16_simd) .unwrap(); - nv12_to(&nv12_src, false, ColorMatrix::Bt709, &mut s_nv12).unwrap(); - nv21_to(&nv21_src, false, ColorMatrix::Bt709, &mut s_nv21).unwrap(); + p010_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); - assert_eq!(rgb_nv12, rgb_nv21); + assert_eq!(rgb_scalar, rgb_simd); + assert_eq!(rgb_u16_scalar, rgb_u16_simd); } - // ---- Yuv420p10 -------------------------------------------------------- + // ---- Yuv420p12 --------------------------------------------------------- + // + // Planar 12-bit, low-bit-packed. Mirrors the Yuv420p10 shape — same + // planar layout, wider sample range. `mid-gray` for 12-bit is + // Y=UV=2048; native-depth white (full-range) is 4095. - fn solid_yuv420p10_frame( + fn solid_yuv420p12_frame( width: u32, height: u32, y: u16, @@ -2223,16 +3144,15 @@ mod tests { } #[test] - fn yuv420p10_rgb_u8_only_gray_is_gray() { - // 10-bit mid-gray: Y=512, UV=512 → 8-bit RGB ≈ 128 on every channel. 
- let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); - let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + fn yuv420p12_rgb_u8_only_gray_is_gray() { + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_rgb(&mut rgb) .unwrap(); - yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); @@ -2242,78 +3162,71 @@ mod tests { } #[test] - fn yuv420p10_rgb_u16_only_native_depth_gray() { - // Same mid-gray frame → u16 RGB output in native 10-bit depth, so - // each channel should be ≈ 512 (the 10-bit mid). - let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); - let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + fn yuv420p12_rgb_u16_only_native_depth_gray() { + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); let mut rgb = std::vec![0u16; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_rgb_u16(&mut rgb) .unwrap(); - yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); for px in rgb.chunks(3) { - assert!(px[0].abs_diff(512) <= 1, "got {px:?}"); + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); assert_eq!(px[0], px[1]); assert_eq!(px[1], px[2]); - // Upper 6 bits of each u16 must be zero — 10-bit convention. - assert!(px[0] <= 1023); + // Upper 4 bits must be zero — 12-bit low-packed convention. + assert!(px[0] <= 4095); } } #[test] - fn yuv420p10_rgb_u8_and_u16_both_populated() { - // 10-bit full-range white: Y=1023, UV=512. 
Both buffers should - // fill with their respective "white" values (255 for u8, 1023 for - // u16) in the same call. - let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 1023, 512, 512); - let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + fn yuv420p12_rgb_u8_and_u16_both_populated() { + // Full-range white: Y=4095, UV=2048. + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 4095, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_rgb(&mut rgb_u8) .unwrap() .with_rgb_u16(&mut rgb_u16) .unwrap(); - yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); assert!(rgb_u8.iter().all(|&c| c == 255)); - assert!(rgb_u16.iter().all(|&c| c == 1023)); + assert!(rgb_u16.iter().all(|&c| c == 4095)); } #[test] - fn yuv420p10_luma_downshifts_to_8bit() { - // Y=512 at 10 bits → 512 >> 2 = 128 at 8 bits. - let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); - let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + fn yuv420p12_luma_downshifts_to_8bit() { + // Y=2048 at 12 bits → 2048 >> (12 - 8) = 128 at 8 bits. + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); let mut luma = std::vec![0u8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_luma(&mut luma) .unwrap(); - yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); assert!(luma.iter().all(|&l| l == 128)); } #[test] - fn yuv420p10_hsv_from_gray_is_zero_hue_zero_sat() { - // HSV derived from the internal u8 RGB scratch: neutral gray → - // H=0, S=0, V≈128. 
Exercises the "HSV without RGB" scratch path - // on the 10-bit source. - let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); - let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + fn yuv420p12_hsv_from_gray_is_zero_hue_zero_sat() { + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); let mut h = std::vec![0xFFu8; 16 * 8]; let mut s = std::vec![0xFFu8; 16 * 8]; let mut v = std::vec![0xFFu8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_hsv(&mut h, &mut s, &mut v) .unwrap(); - yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); assert!(h.iter().all(|&b| b == 0)); assert!(s.iter().all(|&b| b == 0)); @@ -2321,9 +3234,9 @@ mod tests { } #[test] - fn yuv420p10_rgb_u16_too_short_returns_err() { - let mut rgb = std::vec![0u16; 10]; // Way too small. - let err = MixedSinker::::new(16, 8) + fn yuv420p12_rgb_u16_too_short_returns_err() { + let mut rgb = std::vec![0u16; 10]; + let err = MixedSinker::::new(16, 8) .with_rgb_u16(&mut rgb) .err() .unwrap(); @@ -2331,68 +3244,184 @@ mod tests { } #[test] - fn yuv420p10_with_simd_false_matches_with_simd_true() { - // The SIMD toggle exercises scalar-vs-SIMD dispatch. Both paths - // must produce byte-identical results on both outputs. 
- let (yp, up, vp) = solid_yuv420p10_frame(64, 16, 600, 400, 700); - let src = Yuv420p10Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + fn yuv420p12_with_simd_false_matches_with_simd_true() { + let (yp, up, vp) = solid_yuv420p12_frame(64, 16, 2400, 1600, 2800); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; - let mut s_scalar = MixedSinker::::new(64, 16) + let mut s_scalar = MixedSinker::::new(64, 16) .with_simd(false) .with_rgb(&mut rgb_scalar) .unwrap() .with_rgb_u16(&mut rgb_u16_scalar) .unwrap(); - yuv420p10_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + yuv420p12_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; - let mut s_simd = MixedSinker::::new(64, 16) + let mut s_simd = MixedSinker::::new(64, 16) .with_rgb(&mut rgb_simd) .unwrap() .with_rgb_u16(&mut rgb_u16_simd) .unwrap(); - yuv420p10_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); + yuv420p12_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); assert_eq!(rgb_scalar, rgb_simd); assert_eq!(rgb_u16_scalar, rgb_u16_simd); } - // ---- P010 -------------------------------------------------------------- + // ---- Yuv420p14 --------------------------------------------------------- + + fn solid_yuv420p14_frame( + width: u32, + height: u32, + y: u16, + u: u16, + v: u16, + ) -> (Vec, Vec, Vec) { + let w = width as usize; + let h = height as usize; + let cw = w / 2; + let ch = h / 2; + ( + std::vec![y; w * h], + std::vec![u; cw * ch], + std::vec![v; cw * ch], + ) + } + + #[test] + fn yuv420p14_rgb_u8_only_gray_is_gray() { + // 14-bit mid-gray: Y=UV=8192. 
+ let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .unwrap(); + yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } + } + + #[test] + fn yuv420p14_rgb_u16_only_native_depth_gray() { + let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .unwrap(); + yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(8192) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert!(px[0] <= 16383); + } + } + + #[test] + fn yuv420p14_luma_downshifts_to_8bit() { + // Y=8192 at 14 bits → 8192 >> (14 - 8) = 128. 
+ let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) + .unwrap(); + yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&l| l == 128)); + } + + #[test] + fn yuv420p14_rgb_u8_and_u16_both_populated() { + let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 16383, 8192, 8192); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(rgb_u8.iter().all(|&c| c == 255)); + assert!(rgb_u16.iter().all(|&c| c == 16383)); + } + + #[test] + fn yuv420p14_with_simd_false_matches_with_simd_true() { + let (yp, up, vp) = solid_yuv420p14_frame(64, 16, 9600, 6400, 11200); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; + let mut s_scalar = MixedSinker::::new(64, 16) + .with_simd(false) + .with_rgb(&mut rgb_scalar) + .unwrap() + .with_rgb_u16(&mut rgb_u16_scalar) + .unwrap(); + yuv420p14_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + + let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; + let mut s_simd = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_simd) + .unwrap() + .with_rgb_u16(&mut rgb_u16_simd) + .unwrap(); + yuv420p14_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); + + assert_eq!(rgb_scalar, rgb_simd); + assert_eq!(rgb_u16_scalar, rgb_u16_simd); + } + + // ---- P012 -------------------------------------------------------------- // - // 
Semi-planar 10-bit, high-bit-packed (samples in high 10 of each - // u16). Mirrors the Yuv420p10 test shape but with UV interleaved. + // Semi-planar 12-bit, high-bit-packed (samples in high 12 of each + // u16). Mirrors the P010 test shape — UV interleaved, `value << 4`. - fn solid_p010_frame( + fn solid_p012_frame( width: u32, height: u32, - y_10bit: u16, - u_10bit: u16, - v_10bit: u16, + y_12bit: u16, + u_12bit: u16, + v_12bit: u16, ) -> (Vec, Vec) { let w = width as usize; let h = height as usize; let cw = w / 2; let ch = h / 2; - // Shift into the high 10 bits (P010 packing). - let y = std::vec![y_10bit << 6; w * h]; + // Shift into the high 12 bits (P012 packing). + let y = std::vec![y_12bit << 4; w * h]; let uv: Vec = (0..cw * ch) - .flat_map(|_| [u_10bit << 6, v_10bit << 6]) + .flat_map(|_| [u_12bit << 4, v_12bit << 4]) .collect(); (y, uv) } #[test] - fn p010_rgb_u8_only_gray_is_gray() { - // 10-bit mid-gray Y=512, UV=512 → ~128 u8 RGB across the frame. - let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512); - let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16); + fn p012_rgb_u8_only_gray_is_gray() { + let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048); + let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16); let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); - p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); @@ -2402,99 +3431,96 @@ mod tests { } #[test] - fn p010_rgb_u16_only_native_depth_gray() { - // Output u16 is yuv420p10le-packed (10-bit in low 10) even though - // the input is P010-packed. 
- let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512); - let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16); + fn p012_rgb_u16_only_native_depth_gray() { + // Output is low-bit-packed 12-bit (yuv420p12le convention). + let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048); + let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16); let mut rgb = std::vec![0u16; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_rgb_u16(&mut rgb) .unwrap(); - p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); for px in rgb.chunks(3) { - assert!(px[0].abs_diff(512) <= 1, "got {px:?}"); + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); assert_eq!(px[0], px[1]); assert_eq!(px[1], px[2]); assert!( - px[0] <= 1023, - "output must stay within 10-bit low-packed range" + px[0] <= 4095, + "output must stay within 12-bit low-packed range" ); } } #[test] - fn p010_rgb_u8_and_u16_both_populated() { - // 10-bit full-range white: Y=1023, UV=512. Both buffers fill in - // one call. - let (yp, uvp) = solid_p010_frame(16, 8, 1023, 512, 512); - let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16); + fn p012_rgb_u8_and_u16_both_populated() { + let (yp, uvp) = solid_p012_frame(16, 8, 4095, 2048, 2048); + let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16); let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_rgb(&mut rgb_u8) .unwrap() .with_rgb_u16(&mut rgb_u16) .unwrap(); - p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); assert!(rgb_u8.iter().all(|&c| c == 255)); - assert!(rgb_u16.iter().all(|&c| c == 1023)); + assert!(rgb_u16.iter().all(|&c| c == 4095)); } #[test] - fn p010_luma_downshifts_to_8bit() { - // Y=512 at 10 bits, P010-packed (0x8000). 
After >> 8, the 8-bit - // luma is 0x80 = 128. - let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512); - let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16); + fn p012_luma_downshifts_to_8bit() { + // Y=2048 at 12 bits, P012-packed (2048 << 4 = 0x8000). After >> 8, + // the 8-bit luma is 0x80 = 128 — same accessor as P010 since both + // store active bits in the high positions. + let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048); + let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16); let mut luma = std::vec![0u8; 16 * 8]; - let mut sink = MixedSinker::::new(16, 8) + let mut sink = MixedSinker::::new(16, 8) .with_luma(&mut luma) .unwrap(); - p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); assert!(luma.iter().all(|&l| l == 128)); } #[test] - fn p010_matches_yuv420p10_mixed_sinker_with_shifted_samples() { - // Logical equivalence: same samples fed through the two formats - // (low-packed as yuv420p10, high-packed as P010) must produce - // byte-identical u8 RGB. + fn p012_matches_yuv420p12_mixed_sinker_with_shifted_samples() { + // Logical equivalence — same 12-bit samples fed through both + // layouts must produce byte-identical u8 RGB. 
let w = 16u32; let h = 8u32; - let y = 600u16; - let u = 400u16; - let v = 700u16; + let y = 2400u16; + let u = 1600u16; + let v = 2800u16; - let (yp_p10, up_p10, vp_p10) = solid_yuv420p10_frame(w, h, y, u, v); - let src_p10 = Yuv420p10Frame::new(&yp_p10, &up_p10, &vp_p10, w, h, w, w / 2, w / 2); + let (yp_p12, up_p12, vp_p12) = solid_yuv420p12_frame(w, h, y, u, v); + let src_p12 = Yuv420p12Frame::new(&yp_p12, &up_p12, &vp_p12, w, h, w, w / 2, w / 2); - let (yp_p010, uvp_p010) = solid_p010_frame(w, h, y, u, v); - let src_p010 = P010Frame::new(&yp_p010, &uvp_p010, w, h, w, w); + let (yp_p012, uvp_p012) = solid_p012_frame(w, h, y, u, v); + let src_p012 = P012Frame::new(&yp_p012, &uvp_p012, w, h, w, w); let mut rgb_yuv = std::vec![0u8; (w * h * 3) as usize]; - let mut rgb_p010 = std::vec![0u8; (w * h * 3) as usize]; - let mut s_yuv = MixedSinker::::new(w as usize, h as usize) + let mut rgb_p012 = std::vec![0u8; (w * h * 3) as usize]; + let mut s_yuv = MixedSinker::::new(w as usize, h as usize) .with_rgb(&mut rgb_yuv) .unwrap(); - let mut s_p010 = MixedSinker::::new(w as usize, h as usize) - .with_rgb(&mut rgb_p010) + let mut s_p012 = MixedSinker::::new(w as usize, h as usize) + .with_rgb(&mut rgb_p012) .unwrap(); - yuv420p10_to(&src_p10, true, ColorMatrix::Bt709, &mut s_yuv).unwrap(); - p010_to(&src_p010, true, ColorMatrix::Bt709, &mut s_p010).unwrap(); - assert_eq!(rgb_yuv, rgb_p010); + yuv420p12_to(&src_p12, true, ColorMatrix::Bt709, &mut s_yuv).unwrap(); + p012_to(&src_p012, true, ColorMatrix::Bt709, &mut s_p012).unwrap(); + assert_eq!(rgb_yuv, rgb_p012); } #[test] - fn p010_rgb_u16_too_short_returns_err() { + fn p012_rgb_u16_too_short_returns_err() { let mut rgb = std::vec![0u16; 10]; - let err = MixedSinker::::new(16, 8) + let err = MixedSinker::::new(16, 8) .with_rgb_u16(&mut rgb) .err() .unwrap(); @@ -2502,31 +3528,28 @@ mod tests { } #[test] - fn p010_with_simd_false_matches_with_simd_true() { - // Stubs delegate to scalar so simd=true and simd=false produce - 
// byte-identical output for now. Real SIMD backends will replace - // the stubs — equivalence is preserved by design. - let (yp, uvp) = solid_p010_frame(64, 16, 600, 400, 700); - let src = P010Frame::new(&yp, &uvp, 64, 16, 64, 64); + fn p012_with_simd_false_matches_with_simd_true() { + let (yp, uvp) = solid_p012_frame(64, 16, 2400, 1600, 2800); + let src = P012Frame::new(&yp, &uvp, 64, 16, 64, 64); let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; - let mut s_scalar = MixedSinker::<P010>::new(64, 16) + let mut s_scalar = MixedSinker::<P012>::new(64, 16) .with_simd(false) .with_rgb(&mut rgb_scalar) .unwrap() .with_rgb_u16(&mut rgb_u16_scalar) .unwrap(); - p010_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + p012_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; - let mut s_simd = MixedSinker::<P010>::new(64, 16) + let mut s_simd = MixedSinker::<P012>::new(64, 16) .with_rgb(&mut rgb_simd) .unwrap() .with_rgb_u16(&mut rgb_u16_simd) .unwrap(); - p010_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); + p012_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); assert_eq!(rgb_scalar, rgb_simd); assert_eq!(rgb_u16_scalar, rgb_u16_simd); diff --git a/src/sinker/mod.rs b/src/sinker/mod.rs index e6d6d0a..90ce325 100644 --- a/src/sinker/mod.rs +++ b/src/sinker/mod.rs @@ -1,10 +1,11 @@ //! [`PixelSink`](crate::PixelSink) implementations shipped with the //! crate. //! -//! v0.1 ships [`MixedSinker`](mixed::MixedSinker), which writes any -//! subset of `{RGB, Luma, HSV}` into caller-provided buffers. Narrow -//! newtype shortcuts (luma-only, RGB-only, HSV-only) will be added in -//! follow-up commits once the MixedSinker path is proven. +//! Currently ships [`MixedSinker`](mixed::MixedSinker), which writes +//! any subset of `{RGB, Luma, HSV}` into caller-provided buffers. +//! 
It has per-format `PixelSink` impls for all eight shipped YUV +//! source formats (see [`crate::yuv`] for the list). Narrow newtype +//! shortcuts (luma-only, RGB-only, HSV-only) are a follow-up. //! //! `MixedSinker` keeps a lazily‑grown `Vec` scratch buffer for //! the HSV‑without‑RGB path, so it is only compiled under the `std` diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs index eedc2ab..fcbe395 100644 --- a/src/yuv/mod.rs +++ b/src/yuv/mod.rs @@ -1,6 +1,9 @@ //! YUV source kernels. //! -//! One sub-module and kernel per YUV pixel-format family: +//! One sub-module and kernel per YUV pixel-format family. +//! +//! # Shipped (8-bit 4:2:0) +//! //! - [`Yuv420p`](crate::yuv::Yuv420p) — the mainline 4:2:0 **planar** //! layout (H.264 / HEVC / AV1 / VP9 software‑decode default). //! - [`Nv12`](crate::yuv::Nv12) — 4:2:0 **semi‑planar** with interleaved @@ -8,22 +11,55 @@ //! default). //! - [`Nv21`](crate::yuv::Nv21) — 4:2:0 semi‑planar with **VU**-ordered //! chroma (Android MediaCodec default). +//! +//! # Shipped (high-bit-depth 4:2:0, low-bit-packed planar) +//! //! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits //! per sample (HDR10 / 10‑bit SDR software decode). +//! - [`Yuv420p12`](crate::yuv::Yuv420p12) — 4:2:0 planar at 12 bits +//! per sample (HEVC Main 12 / VP9 Profile 3 software decode). +//! - [`Yuv420p14`](crate::yuv::Yuv420p14) — 4:2:0 planar at 14 bits +//! per sample (grading / mastering pipelines). +//! +//! # Shipped (high-bit-depth 4:2:0, high-bit-packed semi-planar) +//! //! - [`P010`](crate::yuv::P010) — 4:2:0 semi‑planar at 10 bits per //! sample, high‑bit‑packed (HDR hardware decode: VideoToolbox, //! VA‑API, NVDEC, D3D11VA, Intel QSV). +//! - [`P012`](crate::yuv::P012) — 4:2:0 semi‑planar at 12 bits per +//! sample, high‑bit‑packed (HEVC Main 12 / VP9 Profile 3 hardware +//! decode). +//! +//! # Not yet shipped //! -//! Other families land in follow-up commits. +//! 
- **16‑bit** (`Yuv420p16` / `P016`) — blocked on a separate +//! kernel family. At `BITS == 16` the Q15 chroma_sum overflows +//! i32, so this needs either i64 intermediates or a lower‑Q +//! coefficient format. The scalar and SIMD kernels here +//! deliberately gate `BITS` to `{10, 12, 14}` (planar) and +//! `{10, 12}` (semi‑planar) via `debug_assert!`. +//! - **4:2:2 / 4:4:4** (`Yuv422p`, `Yuv444p`, `Nv16`, `Nv24`, +//! `Nv42`) — follow‑up, not yet started. They share the scalar +//! Q15 math but need their own row walkers (different chroma +//! subsampling / stride). +//! - **Packed RGB sources** (`Rgb24`, `Bgr24`, `Rgba`, `Bgra`, +//! `Rgba1010102`, etc.) — follow‑up. Will land as their own +//! family of `*_to` kernels feeding a new row‑shape subtrait. mod nv12; mod nv21; mod p010; +mod p012; mod yuv420p; mod yuv420p10; +mod yuv420p12; +mod yuv420p14; pub use nv12::{Nv12, Nv12Row, Nv12Sink, nv12_to}; pub use nv21::{Nv21, Nv21Row, Nv21Sink, nv21_to}; pub use p010::{P010, P010Row, P010Sink, p010_to}; +pub use p012::{P012, P012Row, P012Sink, p012_to}; pub use yuv420p::{Yuv420p, Yuv420pRow, Yuv420pSink, yuv420p_to}; pub use yuv420p10::{Yuv420p10, Yuv420p10Row, Yuv420p10Sink, yuv420p10_to}; +pub use yuv420p12::{Yuv420p12, Yuv420p12Row, Yuv420p12Sink, yuv420p12_to}; +pub use yuv420p14::{Yuv420p14, Yuv420p14Row, Yuv420p14Sink, yuv420p14_to}; diff --git a/src/yuv/p012.rs b/src/yuv/p012.rs new file mode 100644 index 0000000..b7b058e --- /dev/null +++ b/src/yuv/p012.rs @@ -0,0 +1,152 @@ +//! P012 — semi‑planar 4:2:0, 12‑bit, high‑bit‑packed +//! (`AV_PIX_FMT_P012LE`). +//! +//! Storage is a 2‑plane layout identical to [`super::P010`]: one full‑ +//! size Y plane plus one interleaved UV plane at half width and half +//! height. Sample width is `u16` with the 12 active bits in the +//! **high** 12 positions of each element (`sample = value << 4`), low +//! 4 bits zero. This is the 12‑bit sibling of Microsoft's P010 +//! 
convention and what HEVC Main 12 / VP9 Profile 3 hardware decoders +//! emit. +//! +//! Conversion semantics mirror [`super::P010`] on the layout side and +//! [`super::Yuv420p12`] on the Q‑math side: two consecutive Y rows +//! share one UV row (4:2:0), chroma is nearest‑neighbor upsampled in +//! registers inside the row primitive, and every SIMD backend shifts +//! each `u16` load right by 4 (= `16 - BITS` with `BITS == 12`) to +//! extract the 12‑bit value before running the same Q15 pipeline used +//! by [`super::P010`]. + +use crate::{ColorMatrix, PixelSink, SourceFormat, frame::P012Frame, sealed::Sealed}; + +/// Zero‑sized marker for the P012 source format. Used as the `F` type +/// parameter on [`crate::sinker::MixedSinker`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct P012; + +impl Sealed for P012 {} +impl SourceFormat for P012 {} + +/// One output row of a P012 source handed to a [`P012Sink`]. +/// +/// Accessors: +/// - [`y`](Self::y) — full‑width Y row (`width` `u16` samples, high‑ +/// bit‑packed). +/// - [`uv_half`](Self::uv_half) — **interleaved, half‑width** UV row +/// (`width` `u16` elements = `width / 2` U/V pairs, U first). The +/// row primitive deinterleaves and upsamples in‑register. +/// - [`row`](Self::row) — output row index (`0 ..= frame.height() - 1`). +/// - [`matrix`](Self::matrix), [`full_range`](Self::full_range) — +/// carried through from the kernel call. +#[derive(Debug, Clone, Copy)] +pub struct P012Row<'a> { + y: &'a [u16], + uv_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> P012Row<'a> { + /// Bundles one row of a P012 source for a [`P012Sink`]. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub(crate) fn new( + y: &'a [u16], + uv_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + uv_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples, high‑bit‑packed + /// (12 active bits in the high 12 of each element). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Interleaved UV row — `width` `u16` elements laid out as + /// `U0, V0, U1, V1, …, U_{w/2-1}, V_{w/2-1}`. Each element is + /// high‑bit‑packed. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn uv_half(&self) -> &'a [u16] { + self.uv_half + } + + /// Output row index within the frame. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn row(&self) -> usize { + self.row + } + + /// YUV → RGB matrix carried through from the kernel call. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn matrix(&self) -> ColorMatrix { + self.matrix + } + + /// `true` iff Y uses the full sample range (`[0, 4095]` for 12‑bit, + /// scaled into the high 12 bits of each `u16`); `false` for limited + /// range. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn full_range(&self) -> bool { + self.full_range + } +} + +/// Sinks that consume P012 rows. +/// +/// A subtrait of [`PixelSink`] that pins the row shape to +/// [`P012Row`]. Implementors get +/// `process(&mut self, row: P012Row<'_>) -> Result<(), Self::Error>` +/// via the supertrait. +pub trait P012Sink: for<'a> PixelSink<Row<'a> = P012Row<'a>> {} + +/// Converts a P012 frame by walking its rows and feeding each one to +/// the [`P012Sink`]. +/// +/// The kernel is a pure row walker — no color arithmetic happens +/// here. Slice math picks the Y row and the correct UV row for each +/// output row (`chroma_row = row / 2` for 4:2:0) and hands borrows to +/// the Sink. The Sink decides what to derive and where to write. 
+pub fn p012_to<S: P012Sink>( + src: &P012Frame<'_>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + sink.begin_frame(src.width(), src.height())?; + + let w = src.width() as usize; + let h = src.height() as usize; + let y_stride = src.y_stride() as usize; + let uv_stride = src.uv_stride() as usize; + // UV row payload is `width` `u16` elements — `width / 2` interleaved + // U/V pairs. + let uv_row_elems = w; + + let y_plane = src.y(); + let uv_plane = src.uv(); + + for row in 0..h { + let y_start = row * y_stride; + let y = &y_plane[y_start..y_start + w]; + + // 4:2:0 chroma subsampling: two consecutive Y rows share one UV + // row. + let chroma_row = row / 2; + let uv_start = chroma_row * uv_stride; + let uv_half = &uv_plane[uv_start..uv_start + uv_row_elems]; + + sink.process(P012Row::new(y, uv_half, row, matrix, full_range))?; + } + Ok(()) +} diff --git a/src/yuv/yuv420p10.rs b/src/yuv/yuv420p10.rs index 1a85e06..812f180 100644 --- a/src/yuv/yuv420p10.rs +++ b/src/yuv/yuv420p10.rs @@ -4,14 +4,14 @@ //! plus U / V at half width and half height — but sample width is //! **`u16`** (10 active bits in the low bits of each element). The //! [`Yuv420p10Frame`] type alias pins the bit depth; the underlying -//! [`Yuv420pFrame16`] struct is const‑generic over `BITS` so 12‑bit -//! and 14‑bit variants can be added by relaxing its validator without -//! changing kernel math. +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS` and the +//! 12‑bit / 14‑bit siblings ([`super::Yuv420p12`] / [`super::Yuv420p14`]) +//! reuse the same scalar + SIMD kernel family with a different +//! monomorphization. //! -//! Ships in colconv v0.2 as the first high‑bit‑depth format (HDR / -//! 10‑bit SDR keystone). Kernel semantics match [`super::Yuv420p`]: -//! two consecutive Y rows share one chroma row (4:2:0), chroma is -//! nearest‑neighbor upsampled in registers inside the row primitive. +//! 
Kernel semantics match [`super::Yuv420p`]: two consecutive Y rows +//! share one chroma row (4:2:0), chroma is nearest‑neighbor upsampled +//! in registers inside the row primitive. use crate::{ ColorMatrix, PixelSink, SourceFormat, @@ -22,10 +22,11 @@ use crate::{ /// Zero‑sized marker for the YUV 4:2:0 **10‑bit** source format. Used /// as the `F` type parameter on [`crate::sinker::MixedSinker`]. /// -/// colconv v0.2 ships only the 10‑bit specialization; 12‑ and 14‑bit -/// will arrive as separate markers (`Yuv420p12`, `Yuv420p14`) that -/// refer to the same underlying [`Yuv420pFrame16`] struct with -/// different `BITS` values. +/// 12‑bit and 14‑bit siblings ship as separate markers +/// ([`super::Yuv420p12`] / [`super::Yuv420p14`]) on the same +/// [`Yuv420pFrame16`] struct with different `BITS` values. 16‑bit +/// needs a different kernel family (Q15 chroma_sum overflows i32) and +/// is not yet shipped. #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] pub struct Yuv420p10; diff --git a/src/yuv/yuv420p12.rs b/src/yuv/yuv420p12.rs new file mode 100644 index 0000000..5995c6c --- /dev/null +++ b/src/yuv/yuv420p12.rs @@ -0,0 +1,161 @@ +//! YUV 4:2:0 planar 12‑bit (`AV_PIX_FMT_YUV420P12LE`). +//! +//! Storage mirrors [`super::Yuv420p10`] — three planes, Y at full size +//! plus U / V at half width and half height — with **`u16`** samples +//! (12 active bits in the **low** 12 of each element, upper 4 zero). +//! The [`Yuv420p12Frame`] type alias pins the bit depth; the underlying +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS`, so the same +//! Q15 scalar + SIMD kernel family that powers `Yuv420p10` runs +//! unchanged against the 12‑bit instantiation. +//! +//! Ships in colconv v0.2a alongside [`super::Yuv420p14`] and +//! [`super::P012`]. Kernel semantics match [`super::Yuv420p10`]: two +//! consecutive Y rows share one chroma row (4:2:0), chroma is +//! nearest‑neighbor upsampled in registers inside the row primitive, +//! 
and Q15 intermediates stay in i32 (chroma_sum < 10⁹ < i32 max at 12 +//! bits — verified against the scalar reference per SIMD backend). + +use crate::{ + ColorMatrix, PixelSink, SourceFormat, + frame::{Yuv420p12Frame, Yuv420pFrame16}, + sealed::Sealed, +}; + +/// Zero‑sized marker for the YUV 4:2:0 **12‑bit** source format. Used +/// as the `F` type parameter on [`crate::sinker::MixedSinker`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct Yuv420p12; + +impl Sealed for Yuv420p12 {} +impl SourceFormat for Yuv420p12 {} + +/// One output row of a 12‑bit YUV 4:2:0 source handed to a +/// [`Yuv420p12Sink`]. Structurally identical to [`super::Yuv420p10Row`], +/// just with values in `[0, 4095]` instead of `[0, 1023]`. +#[derive(Debug, Clone, Copy)] +pub struct Yuv420p12Row<'a> { + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> Yuv420p12Row<'a> { + /// Bundles one row of a 12‑bit 4:2:0 source for a [`Yuv420p12Sink`]. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub(crate) fn new( + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + u_half, + v_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Half‑width U (Cb) row — `width / 2` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn u_half(&self) -> &'a [u16] { + self.u_half + } + + /// Half‑width V (Cr) row — `width / 2` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn v_half(&self) -> &'a [u16] { + self.v_half + } + + /// Output row index within the frame. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn row(&self) -> usize { + self.row + } + + /// YUV → RGB matrix carried through from the kernel call. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn matrix(&self) -> ColorMatrix { + self.matrix + } + + /// `true` iff Y uses the full sample range (`[0, 4095]` for 12‑bit); + /// `false` for limited range (`[256, 3760]` luma, `[256, 3840]` + /// chroma — the 8‑bit `[16, 235]` / `[16, 240]` ranges scaled by 16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn full_range(&self) -> bool { + self.full_range + } +} + +/// Sinks that consume 12‑bit YUV 4:2:0 rows. +pub trait Yuv420p12Sink: for<'a> PixelSink<Row<'a> = Yuv420p12Row<'a>> {} + +/// Converts a 12‑bit YUV 4:2:0 frame by walking its rows and feeding +/// each one to the [`Yuv420p12Sink`]. Mirrors [`super::yuv420p10_to`] — +/// pure row walker, all color arithmetic happens inside the Sink via +/// the crate's row primitives instantiated at `BITS == 12`. +pub fn yuv420p12_to<S: Yuv420p12Sink>( + src: &Yuv420p12Frame<'_>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + yuv420p12_walker::<12, S>(src, full_range, matrix, sink) +} + +/// Row walker for the 12‑bit YUV 4:2:0 source. `BITS` is a const +/// generic so [`Yuv420pFrame16`] geometry reads (stride, plane +/// slicing) are monomorphized; the row/sink types bound below are +/// still pinned to the 12‑bit variants. 
+#[cfg_attr(not(tarpaulin), inline(always))] +fn yuv420p12_walker<const BITS: usize, S: Yuv420p12Sink>( + src: &Yuv420pFrame16<'_, BITS>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + sink.begin_frame(src.width(), src.height())?; + + let w = src.width() as usize; + let h = src.height() as usize; + let y_stride = src.y_stride() as usize; + let u_stride = src.u_stride() as usize; + let v_stride = src.v_stride() as usize; + let chroma_width = w / 2; + + let y_plane = src.y(); + let u_plane = src.u(); + let v_plane = src.v(); + + for row in 0..h { + let y_start = row * y_stride; + let y = &y_plane[y_start..y_start + w]; + + let chroma_row = row / 2; + let u_start = chroma_row * u_stride; + let v_start = chroma_row * v_stride; + let u_half = &u_plane[u_start..u_start + chroma_width]; + let v_half = &v_plane[v_start..v_start + chroma_width]; + + sink.process(Yuv420p12Row::new( + y, u_half, v_half, row, matrix, full_range, + ))?; + } + Ok(()) +} diff --git a/src/yuv/yuv420p14.rs b/src/yuv/yuv420p14.rs new file mode 100644 index 0000000..27c54ee --- /dev/null +++ b/src/yuv/yuv420p14.rs @@ -0,0 +1,159 @@ +//! YUV 4:2:0 planar 14‑bit (`AV_PIX_FMT_YUV420P14LE`). +//! +//! Storage mirrors [`super::Yuv420p10`] — three planes, Y at full size +//! plus U / V at half width and half height — with **`u16`** samples +//! (14 active bits in the **low** 14 of each element, upper 2 zero). +//! The [`Yuv420p14Frame`] type alias pins the bit depth; the underlying +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS`, so the same +//! Q15 scalar + SIMD kernel family that powers `Yuv420p10` / +//! `Yuv420p12` runs unchanged against the 14‑bit instantiation. +//! +//! Kernel math constraint: at 14 bits, chroma_sum still fits in i32 +//! (~10⁹ ≤ 2³¹), so the Q15 pipeline stays unchanged. 16‑bit would +//! overflow and needs a separate kernel family. 
+ +use crate::{ + ColorMatrix, PixelSink, SourceFormat, + frame::{Yuv420p14Frame, Yuv420pFrame16}, + sealed::Sealed, +}; + +/// Zero‑sized marker for the YUV 4:2:0 **14‑bit** source format. Used +/// as the `F` type parameter on [`crate::sinker::MixedSinker`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct Yuv420p14; + +impl Sealed for Yuv420p14 {} +impl SourceFormat for Yuv420p14 {} + +/// One output row of a 14‑bit YUV 4:2:0 source handed to a +/// [`Yuv420p14Sink`]. Structurally identical to [`super::Yuv420p10Row`], +/// just with values in `[0, 16383]` instead of `[0, 1023]`. +#[derive(Debug, Clone, Copy)] +pub struct Yuv420p14Row<'a> { + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> Yuv420p14Row<'a> { + /// Bundles one row of a 14‑bit 4:2:0 source for a [`Yuv420p14Sink`]. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub(crate) fn new( + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + u_half, + v_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Half‑width U (Cb) row — `width / 2` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn u_half(&self) -> &'a [u16] { + self.u_half + } + + /// Half‑width V (Cr) row — `width / 2` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn v_half(&self) -> &'a [u16] { + self.v_half + } + + /// Output row index within the frame. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn row(&self) -> usize { + self.row + } + + /// YUV → RGB matrix carried through from the kernel call. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn matrix(&self) -> ColorMatrix { + self.matrix + } + + /// `true` iff Y uses the full sample range (`[0, 16383]` for + /// 14‑bit); `false` for limited range (`[1024, 15040]` luma, + /// `[1024, 15360]` chroma — the 8‑bit `[16, 235]` / `[16, 240]` + /// ranges scaled by 64). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn full_range(&self) -> bool { + self.full_range + } +} + +/// Sinks that consume 14‑bit YUV 4:2:0 rows. +pub trait Yuv420p14Sink: for<'a> PixelSink<Row<'a> = Yuv420p14Row<'a>> {} + +/// Converts a 14‑bit YUV 4:2:0 frame by walking its rows and feeding +/// each one to the [`Yuv420p14Sink`]. Mirrors [`super::yuv420p10_to`] — +/// pure row walker, all color arithmetic happens inside the Sink via +/// the crate's row primitives instantiated at `BITS == 14`. +pub fn yuv420p14_to<S: Yuv420p14Sink>( + src: &Yuv420p14Frame<'_>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + yuv420p14_walker::<14, S>(src, full_range, matrix, sink) +} + +/// Row walker for the 14‑bit YUV 4:2:0 source. `BITS` is a const +/// generic so [`Yuv420pFrame16`] geometry reads (stride, plane +/// slicing) are monomorphized; the row/sink types bound below are +/// still pinned to the 14‑bit variants. 
+#[cfg_attr(not(tarpaulin), inline(always))] +fn yuv420p14_walker<const BITS: usize, S: Yuv420p14Sink>( + src: &Yuv420pFrame16<'_, BITS>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + sink.begin_frame(src.width(), src.height())?; + + let w = src.width() as usize; + let h = src.height() as usize; + let y_stride = src.y_stride() as usize; + let u_stride = src.u_stride() as usize; + let v_stride = src.v_stride() as usize; + let chroma_width = w / 2; + + let y_plane = src.y(); + let u_plane = src.u(); + let v_plane = src.v(); + + for row in 0..h { + let y_start = row * y_stride; + let y = &y_plane[y_start..y_start + w]; + + let chroma_row = row / 2; + let u_start = chroma_row * u_stride; + let v_start = chroma_row * v_stride; + let u_half = &u_plane[u_start..u_start + chroma_width]; + let v_half = &v_plane[v_start..v_start + chroma_width]; + + sink.process(Yuv420p14Row::new( + y, u_half, v_half, row, matrix, full_range, + ))?; + } + Ok(()) +}