diff --git a/Cargo.toml b/Cargo.toml index 2f34a10..458d138 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,10 @@ harness = false name = "yuv_420p10_to_rgb" harness = false +[[bench]] +name = "p010_to_rgb" +harness = false + [[bench]] name = "rgb_to_hsv" harness = false diff --git a/benches/p010_to_rgb.rs b/benches/p010_to_rgb.rs new file mode 100644 index 0000000..6466b64 --- /dev/null +++ b/benches/p010_to_rgb.rs @@ -0,0 +1,106 @@ +//! Per‑row P010 (semi‑planar 4:2:0, 10‑bit, high‑bit‑packed) → RGB +//! throughput baseline. +//! +//! Mirrors [`yuv_420p10_to_rgb`] — two output paths per width: +//! - `u8_*` — P010 → packed 8‑bit RGB (hot path for scene / keyframe +//! detection). +//! - `u16_*` — P010 → native‑depth 10‑bit RGB in `u16` storage +//! (lossless, for HDR tone mapping). +//! +//! Each width gets a `scalar` vs `simd` pair so the SIMD speedup on +//! whichever backend the dispatcher selects is a two‑line comparison +//! in the Criterion report. + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{p010_to_rgb_row, p010_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic P010‑packed pseudo‑random +/// sequence — 10‑bit values shifted into the high 10 bits of each +/// `u16` (low 6 bits zero), matching the real P010 storage layout. +fn fill_pseudo_random_p010(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = (((state >> 8) & 0x3FF) as u16) << 6; + } +} + +fn bench(c: &mut Criterion) { + // 720p / 1080p / 4K widths — multiples of 64 so the widest SIMD + // tier (AVX‑512, 64 pixels per iteration) covers each block fully. 
+ const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + // ---- u8 output ------------------------------------------------------ + let mut group_u8 = c.benchmark_group("p010_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + // UV row payload is `width` u16 elements (w / 2 interleaved pairs). + let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p010(&mut y, 0x1111); + fill_pseudo_random_p010(&mut uv, 0x2222); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p010_to_rgb_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + // ---- u16 native-depth output ---------------------------------------- + let mut group_u16 = c.benchmark_group("p010_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p010(&mut y, 0x1111); + fill_pseudo_random_p010(&mut uv, 0x2222); + let mut rgb = std::vec![0u16; w * 3]; + + // u16 output writes 2× the bytes of u8. 
+ group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p010_to_rgb_u16_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/src/frame.rs b/src/frame.rs index 585d59f..8becc63 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -463,6 +463,369 @@ pub enum Nv12FrameError { }, } +/// A validated P010 (semi‑planar 4:2:0, 10‑bit `u16`) frame. +/// +/// The canonical layout emitted by Apple VideoToolbox, VA‑API, NVDEC, +/// D3D11VA, and Intel QSV for 10‑bit HDR hardware‑decoded output. Same +/// plane shape as [`Nv12Frame`] — one full‑size luma plane plus one +/// interleaved UV plane at half width and half height — but sample +/// width is **`u16`** and the 10 active bits sit in the **high** 10 of +/// each element (`sample = value << 6`, low 6 bits zero). That matches +/// Microsoft's P010 convention and FFmpeg's `AV_PIX_FMT_P010LE`. +/// +/// This is **not** the [`Yuv420p10Frame`] layout — yuv420p10le puts the +/// 10 bits in the **low** 10 of each `u16`. Callers holding a P010 +/// buffer must use [`P010Frame`]; callers holding yuv420p10le must use +/// [`Yuv420p10Frame`]. Kernels mask/shift appropriately for each. +/// +/// Stride is in **samples** (`u16` elements), not bytes. Users holding +/// an FFmpeg byte buffer should cast via [`bytemuck::cast_slice`] and +/// divide `linesize[i]` by 2 before constructing. +/// +/// Two planes: +/// - `y` — full‑size luma, `y_stride >= width`, length +/// `>= y_stride * height` (all in `u16` samples). 
+/// - `uv` — interleaved chroma (`U0, V0, U1, V1, …`) at half width and +/// half height, so each UV row carries `2 * ceil(width / 2) = width` +/// `u16` elements; `uv_stride >= width`, length +/// `>= uv_stride * ceil(height / 2)`. +/// +/// `width` must be even (same 4:2:0 rationale as the other frame +/// types); `height` may be odd (handled via `height.div_ceil(2)` in +/// chroma‑row sizing). +/// +/// # Input sample range and packing sanity +/// +/// Each `u16` sample's 10 active bits live in the high 10 positions; +/// the low 6 bits are expected to be zero. [`Self::try_new`] validates +/// geometry only. +/// +/// [`Self::try_new_checked`] additionally scans every sample and +/// rejects any with non‑zero low 6 bits — a **necessary but not +/// sufficient** packing sanity check. It catches mispacked +/// `yuv420p10le` buffers as long as **at least one** sample has +/// low‑bit content (the usual case for noisy real‑world image data), +/// but it **cannot distinguish** P010 from a `yuv420p10le` buffer +/// whose samples all happen to be multiples of 64. Values like +/// `Y = 64` (limited‑range black) and `UV = 512` (neutral chroma) +/// both have low 6 bits zero and so pass the check, even though the +/// buffer layout is wrong. For strict provenance, callers must rely +/// on their source format metadata and pick the right frame type +/// ([`P010Frame`] vs [`Yuv420p10Frame`]) at construction. +/// +/// Kernels shift each load right by 6 to extract the 10‑bit value, +/// so mispacked input (e.g. a `yuv420p10le` buffer handed to the +/// P010 kernel) produces deterministic, backend‑independent output +/// — wrong colors, but consistently wrong across scalar + every +/// SIMD backend, which is visible in any output diff. 
+#[derive(Debug, Clone, Copy)] +pub struct P010Frame<'a> { + y: &'a [u16], + uv: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, +} + +impl<'a> P010Frame<'a> { + /// Constructs a new [`P010Frame`], validating dimensions and plane + /// lengths. Strides are in `u16` **samples**. + /// + /// Returns [`P010FrameError`] if any of: + /// - `width` or `height` is zero, + /// - `width` is odd, + /// - `y_stride < width`, + /// - `uv_stride < width` (the UV row holds `width / 2` interleaved + /// pairs = `width` `u16` elements), + /// - either plane is too short, or + /// - `stride * rows` overflows `usize` (32‑bit targets only). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn try_new( + y: &'a [u16], + uv: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Result { + if width == 0 || height == 0 { + return Err(P010FrameError::ZeroDimension { width, height }); + } + if width & 1 != 0 { + return Err(P010FrameError::OddWidth { width }); + } + if y_stride < width { + return Err(P010FrameError::YStrideTooSmall { width, y_stride }); + } + let uv_row_elems = width; + if uv_stride < uv_row_elems { + return Err(P010FrameError::UvStrideTooSmall { + uv_row_elems, + uv_stride, + }); + } + + let y_min = match (y_stride as usize).checked_mul(height as usize) { + Some(v) => v, + None => { + return Err(P010FrameError::GeometryOverflow { + stride: y_stride, + rows: height, + }); + } + }; + if y.len() < y_min { + return Err(P010FrameError::YPlaneTooShort { + expected: y_min, + actual: y.len(), + }); + } + let chroma_height = height.div_ceil(2); + let uv_min = match (uv_stride as usize).checked_mul(chroma_height as usize) { + Some(v) => v, + None => { + return Err(P010FrameError::GeometryOverflow { + stride: uv_stride, + rows: chroma_height, + }); + } + }; + if uv.len() < uv_min { + return Err(P010FrameError::UvPlaneTooShort { + expected: uv_min, + actual: uv.len(), + }); + } + + Ok(Self { + y, + uv, + width, + height, + 
y_stride, + uv_stride, + }) + } + + /// Constructs a new [`P010Frame`], panicking on invalid inputs. + /// Prefer [`Self::try_new`] when inputs may be invalid at runtime. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn new( + y: &'a [u16], + uv: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Self { + match Self::try_new(y, uv, width, height, y_stride, uv_stride) { + Ok(frame) => frame, + Err(_) => panic!("invalid P010Frame dimensions or plane lengths"), + } + } + + /// Like [`Self::try_new`] but additionally scans every sample and + /// rejects any whose **low 6 bits** are non‑zero. A valid P010 + /// sample has its 10 active bits in the high 10 positions and zero + /// below, so non‑zero low bits is evidence the buffer isn't P010. + /// + /// **This is a packing sanity check, not a provenance validator.** + /// The check catches noisy `yuv420p10le` data (where most samples + /// have low‑bit content), but it **cannot** distinguish P010 from + /// a `yuv420p10le` buffer whose samples all happen to be multiples + /// of 64. Common flat‑region values like `Y = 64` (limited‑range + /// black) or `UV = 512` (neutral chroma) are multiples of 64 in + /// both layouts, so a yuv420p10le buffer of flat content will + /// silently pass this check. Callers who need strict provenance + /// must rely on their source format metadata and pick the right + /// frame type at construction ([`P010Frame`] vs [`Yuv420p10Frame`]); + /// no runtime check on opaque `u16` data can reliably tell the two + /// layouts apart. + /// + /// Cost: one O(plane_size) scan per plane. The default + /// [`Self::try_new`] skips this so the hot path stays O(1). + /// + /// Returns [`P010FrameError::SampleLowBitsSet`] on the first + /// offending sample — carries the plane, element index, and + /// offending value. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn try_new_checked( + y: &'a [u16], + uv: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Result { + let frame = Self::try_new(y, uv, width, height, y_stride, uv_stride)?; + let w = width as usize; + let h = height as usize; + let uv_w = w; // interleaved: `width / 2` pairs × 2 elements + let chroma_h = height.div_ceil(2) as usize; + for row in 0..h { + let start = row * y_stride as usize; + for (col, &s) in y[start..start + w].iter().enumerate() { + if s & 0x3F != 0 { + return Err(P010FrameError::SampleLowBitsSet { + plane: P010FramePlane::Y, + index: start + col, + value: s, + }); + } + } + } + for row in 0..chroma_h { + let start = row * uv_stride as usize; + for (col, &s) in uv[start..start + uv_w].iter().enumerate() { + if s & 0x3F != 0 { + return Err(P010FrameError::SampleLowBitsSet { + plane: P010FramePlane::Uv, + index: start + col, + value: s, + }); + } + } + } + Ok(frame) + } + + /// Y (luma) plane samples. Row `r` starts at sample offset + /// `r * y_stride()`. Each sample's 10 active bits sit in the **high** + /// 10 positions of the `u16` (low 6 bits zero). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn y(&self) -> &'a [u16] { + self.y + } + + /// Interleaved UV plane samples. Each chroma row starts at sample + /// offset `chroma_row * uv_stride()` and contains `width` `u16` + /// elements laid out as `U0, V0, U1, V1, …, U_{w/2-1}, V_{w/2-1}`. + /// Each element's 10 active bits sit in the high 10 positions. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn uv(&self) -> &'a [u16] { + self.uv + } + + /// Frame width in pixels. Always even. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn width(&self) -> u32 { + self.width + } + + /// Frame height in pixels. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn height(&self) -> u32 { + self.height + } + + /// Sample stride of the Y plane (`>= width`). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn y_stride(&self) -> u32 { + self.y_stride + } + + /// Sample stride of the interleaved UV plane (`>= width`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn uv_stride(&self) -> u32 { + self.uv_stride + } +} + +/// Identifies which plane of a [`P010Frame`] a +/// [`P010FrameError::SampleLowBitsSet`] refers to. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)] +pub enum P010FramePlane { + /// Luma plane. + Y, + /// Interleaved UV plane. + Uv, +} + +/// Errors returned by [`P010Frame::try_new`] and +/// [`P010Frame::try_new_checked`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] +#[non_exhaustive] +pub enum P010FrameError { + /// `width` or `height` was zero. + #[error("width ({width}) or height ({height}) is zero")] + ZeroDimension { + /// The supplied width. + width: u32, + /// The supplied height. + height: u32, + }, + /// `width` was odd. Same 4:2:0 rationale as the other semi‑planar + /// formats. + #[error("width ({width}) is odd; 4:2:0 requires even width")] + OddWidth { + /// The supplied width. + width: u32, + }, + /// `y_stride < width` (in `u16` samples). + #[error("y_stride ({y_stride}) is smaller than width ({width})")] + YStrideTooSmall { + /// Declared frame width in pixels. + width: u32, + /// The supplied Y‑plane stride (samples). + y_stride: u32, + }, + /// `uv_stride` is smaller than the `width` `u16` elements of + /// interleaved UV payload one chroma row must hold. + #[error("uv_stride ({uv_stride}) is smaller than UV row payload ({uv_row_elems} u16 elements)")] + UvStrideTooSmall { + /// Required minimum UV‑plane stride (`= width`). + uv_row_elems: u32, + /// The supplied UV‑plane stride (samples). + uv_stride: u32, + }, + /// Y plane is shorter than `y_stride * height` samples. + #[error("Y plane has {actual} samples but at least {expected} are required")] + YPlaneTooShort { + /// Minimum samples required. 
+ expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// UV plane is shorter than `uv_stride * ceil(height / 2)` samples. + #[error("UV plane has {actual} samples but at least {expected} are required")] + UvPlaneTooShort { + /// Minimum samples required. + expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// `stride * rows` overflows `usize` (32‑bit targets only). + #[error("declared geometry overflows usize: stride={stride} * rows={rows}")] + GeometryOverflow { + /// Stride of the plane whose size overflowed. + stride: u32, + /// Row count that overflowed against the stride. + rows: u32, + }, + /// A sample's low 6 bits were non‑zero — P010 packs its 10 active + /// bits in the high 10 of each `u16`, so valid samples are always + /// multiples of 64 (`value << 6`). Only + /// [`P010Frame::try_new_checked`] can produce this error. + /// + /// Note: the absence of this error does **not** prove the buffer + /// is P010. A `yuv420p10le` buffer of samples that all happen to + /// be multiples of 64 (e.g. `Y = 64`, `UV = 512`) passes the + /// check silently. See [`P010Frame::try_new_checked`] for the + /// full discussion. + #[error( + "sample {value:#06x} on plane {plane} at element {index} has non-zero low 6 bits (not a valid P010 sample)" + )] + SampleLowBitsSet { + /// Which plane the offending sample lives on. + plane: P010FramePlane, + /// Element index within that plane's slice. + index: usize, + /// The offending sample value. + value: u16, + }, +} + /// A validated NV21 (semi‑planar 4:2:0) frame. /// /// Structurally identical to [`Nv12Frame`] — one full-size luma plane @@ -1764,4 +2127,177 @@ mod tests { let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); assert!(matches!(e, Yuv420pFrame16Error::YPlaneTooShort { .. })); } + + // ---- P010Frame --------------------------------------------------------- + // + // Semi‑planar 10‑bit. 
Plane shape mirrors Nv12Frame (Y + interleaved + // UV) but sample width is `u16` with the 10 active bits in the + // **high** 10 of each element (`value << 6`). Strides are in + // samples, not bytes. + + fn p010_planes() -> (std::vec::Vec, std::vec::Vec) { + // 16×8 frame — UV plane carries 16 u16 × 4 chroma rows = 64 u16. + // P010 white Y = 1023 << 6 = 0xFFC0; neutral UV = 512 << 6 = 0x8000. + (std::vec![0xFFC0u16; 16 * 8], std::vec![0x8000u16; 16 * 4]) + } + + #[test] + fn p010_try_new_accepts_valid_tight() { + let (y, uv) = p010_planes(); + let f = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).expect("valid"); + assert_eq!(f.width(), 16); + assert_eq!(f.height(), 8); + assert_eq!(f.uv_stride(), 16); + } + + #[test] + fn p010_try_new_accepts_odd_height() { + // 640×481 — same concrete odd‑height case covered by NV12 / NV21. + let y = std::vec![0u16; 640 * 481]; + let uv = std::vec![0x8000u16; 640 * 241]; + let f = P010Frame::try_new(&y, &uv, 640, 481, 640, 640).expect("odd height valid"); + assert_eq!(f.height(), 481); + } + + #[test] + fn p010_try_new_rejects_odd_width() { + let (y, uv) = p010_planes(); + let e = P010Frame::try_new(&y, &uv, 15, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::OddWidth { width: 15 })); + } + + #[test] + fn p010_try_new_rejects_zero_dim() { + let (y, uv) = p010_planes(); + let e = P010Frame::try_new(&y, &uv, 0, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::ZeroDimension { .. })); + } + + #[test] + fn p010_try_new_rejects_y_stride_under_width() { + let (y, uv) = p010_planes(); + let e = P010Frame::try_new(&y, &uv, 16, 8, 8, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::YStrideTooSmall { .. })); + } + + #[test] + fn p010_try_new_rejects_uv_stride_under_width() { + let (y, uv) = p010_planes(); + let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 8).unwrap_err(); + assert!(matches!(e, P010FrameError::UvStrideTooSmall { .. 
})); + } + + #[test] + fn p010_try_new_rejects_short_y_plane() { + let y = std::vec![0u16; 10]; + let uv = std::vec![0x8000u16; 16 * 4]; + let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::YPlaneTooShort { .. })); + } + + #[test] + fn p010_try_new_rejects_short_uv_plane() { + let y = std::vec![0u16; 16 * 8]; + let uv = std::vec![0x8000u16; 8]; + let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::UvPlaneTooShort { .. })); + } + + #[test] + #[should_panic(expected = "invalid P010Frame")] + fn p010_new_panics_on_invalid() { + let y = std::vec![0u16; 10]; + let uv = std::vec![0x8000u16; 16 * 4]; + let _ = P010Frame::new(&y, &uv, 16, 8, 16, 16); + } + + #[cfg(target_pointer_width = "32")] + #[test] + fn p010_try_new_rejects_geometry_overflow() { + let big: u32 = 0x1_0000; + let y: [u16; 0] = []; + let uv: [u16; 0] = []; + let e = P010Frame::try_new(&y, &uv, big, big, big, big).unwrap_err(); + assert!(matches!(e, P010FrameError::GeometryOverflow { .. })); + } + + #[test] + fn p010_try_new_checked_accepts_shifted_samples() { + // Valid P010 samples: low 6 bits zero. + let (y, uv) = p010_planes(); + P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).expect("shifted samples valid"); + } + + #[test] + fn p010_try_new_checked_rejects_y_low_bits_set() { + // A Y sample with low 6 bits set — characteristic of yuv420p10le + // packing (value in low 10 bits) accidentally handed to the P010 + // constructor. `try_new_checked` catches this; plain `try_new` + // would let the kernel mask it down and produce wrong colors. + let mut y = std::vec![0xFFC0u16; 16 * 8]; + y[3 * 16 + 5] = 0x03FF; // 10-bit value in low bits — wrong packing + let uv = std::vec![0x8000u16; 16 * 4]; + let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + match e { + P010FrameError::SampleLowBitsSet { plane, value, .. 
} => { + assert_eq!(plane, P010FramePlane::Y); + assert_eq!(value, 0x03FF); + } + other => panic!("expected SampleLowBitsSet, got {other:?}"), + } + } + + #[test] + fn p010_try_new_checked_rejects_uv_plane_sample() { + let y = std::vec![0xFFC0u16; 16 * 8]; + let mut uv = std::vec![0x8000u16; 16 * 4]; + uv[2 * 16 + 3] = 0x0001; // low bit set + let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!( + e, + P010FrameError::SampleLowBitsSet { + plane: P010FramePlane::Uv, + value: 0x0001, + .. + } + )); + } + + #[test] + fn p010_try_new_checked_reports_geometry_errors_first() { + let y = std::vec![0u16; 10]; // Too small. + let uv = std::vec![0x8000u16; 16 * 4]; + let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::YPlaneTooShort { .. })); + } + + /// Regression documenting a **known limitation** of + /// [`P010Frame::try_new_checked`]: the low‑6‑bits‑zero check is a + /// packing sanity check, not a provenance validator. A + /// `yuv420p10le` buffer whose samples all happen to be multiples + /// of 64 — e.g. `Y = 64` (limited‑range black, `0x0040`) and + /// `UV = 512` (neutral chroma, `0x0200`) — passes the check + /// silently, even though the layout is wrong and downstream P010 + /// kernels will produce incorrect output. + /// + /// The test asserts the check accepts these values so the limit + /// is visible in the test log; any future attempt to tighten the + /// constructor into a real provenance validator will need to + /// update or replace this test. + #[test] + fn p010_try_new_checked_accepts_ambiguous_yuv420p10le_samples() { + // `yuv420p10le`-style samples, all multiples of 64: low 6 bits + // are zero, so they pass the P010 sanity check even though this + // is wrong data for a P010 frame. 
+ let y = std::vec![0x0040u16; 16 * 8]; // limited-range black in 10-bit low-packed + let uv = std::vec![0x0200u16; 16 * 4]; // neutral chroma in 10-bit low-packed + let f = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16) + .expect("known limitation: low-6-bits-zero check cannot tell yuv420p10le from P010"); + assert_eq!(f.width(), 16); + // Downstream decoding of this frame would produce wrong colors + // (every `>> 6` extracts 1 from Y=0x0040 and 8 from UV=0x0200, + // which P010 kernels then bias/scale as if those were the 10-bit + // source values). That's accepted behavior — the type system, + // not `try_new_checked`, is what keeps yuv420p10le out of P010. + } } diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 5cefb9e..f98d9cd 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -37,10 +37,10 @@ use core::arch::aarch64::{ vaddq_s32, vandq_u16, vbslq_f32, vceqq_f32, vcltq_f32, vcombine_s16, vcombine_u8, vcombine_u16, vcvtq_f32_u32, vcvtq_u32_f32, vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_u16, vget_high_s16, vget_high_u8, vget_high_u16, vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, - vld1q_u8, vld1q_u16, vld2_u8, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, vmovl_s16, - vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, vqaddq_s16, - vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, vst1q_u8, - vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, + vld1q_u8, vld1q_u16, vld2_u8, vld2q_u16, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, + vmovl_s16, vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, + vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, + vshrq_n_u16, vst1q_u8, vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, }; use crate::{ColorMatrix, row::scalar}; @@ -488,6 +488,267 @@ fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: 
int16x8_t) -> uint16x8_t { unsafe { vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(v, zero_v), max_v)) } } +/// NEON P010 → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. Differences +/// from [`yuv420p10_to_rgb_row`]: +/// - UV is semi‑planar interleaved (`U0, V0, U1, V1, …`), split in +/// one shot via `vld2q_u16` (returns separate U and V vectors). +/// - Each `u16` load is **shifted right by 6** (`vshrq_n_u16::<6>`) +/// instead of AND‑masked — P010 packs its 10 active bits in the +/// HIGH 10 of each `u16`, so `>> 6` extracts the value and +/// simultaneously clears the low 6 bits (which the format mandates +/// are zero anyway; the shift makes mispacked input deterministic). +/// - Chroma bias is 512 (10‑bit center) after the shift. +/// +/// After the shift, the rest of the pipeline is identical to the +/// `yuv420p10` path — same `chroma_i16x8` / `scale_y` / `chroma_dup` +/// / `vst3q_u8` write, with `range_params_n::<10, 8>` scaling. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: NEON availability is the caller's obligation. 
+ unsafe { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // 16 Y pixels in two u16x8 loads, shifted right by 6 to extract + // the 10‑bit values from P010's high‑bit packing. + let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); + let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + + // Semi‑planar UV: `vld2q_u16` loads 16 interleaved `u16` elements + // and returns (evens, odds) = (U, V) in one shot. Each gets the + // same `>> 6` shift as Y. + let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); + let u_vec = vshrq_n_u16::<6>(uv_pair.0); + let v_vec = vshrq_n_u16::<6>(uv_pair.1); + + let y_lo = vreinterpretq_s16_u16(y_vec_lo); + let y_hi = vreinterpretq_s16_u16(y_vec_hi); + + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, 
u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = vzip1q_s16(r_chroma, r_chroma); + let r_dup_hi = vzip2q_s16(r_chroma, r_chroma); + let g_dup_lo = vzip1q_s16(g_chroma, g_chroma); + let g_dup_hi = vzip2q_s16(g_chroma, g_chroma); + let b_dup_lo = vzip1q_s16(b_chroma, b_chroma); + let b_dup_hi = vzip2q_s16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_lo, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); + + let b_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, b_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, b_dup_hi)), + ); + let g_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, g_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, g_dup_hi)), + ); + let r_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, r_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), + ); + + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 16; + } + + if x < width { + scalar::p010_to_rgb_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// NEON P010 → packed **10‑bit `u16`** RGB (native‑depth, low‑bit‑ +/// packed output — `yuv420p10le` convention, not P010). +/// +/// Same structure as [`p010_to_rgb_row`] up to the chroma compute; +/// the only differences are: +/// - `range_params_n::<10, 10>` → larger scales targeting the 10‑bit +/// output range. +/// - Clamp is explicit min/max to `[0, 1023]` via +/// [`clamp_u10`](crate::row::arch::neon::clamp_u10). +/// - Writes use two `vst3q_u16` calls per 16‑pixel block. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: NEON availability is the caller's obligation. + unsafe { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let max_v = vdupq_n_s16(OUT_MAX_10); + let zero_v = vdupq_n_s16(0); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); + let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); + let u_vec = vshrq_n_u16::<6>(uv_pair.0); + let v_vec = vshrq_n_u16::<6>(uv_pair.1); + + let y_lo = vreinterpretq_s16_u16(y_vec_lo); + let y_hi = vreinterpretq_s16_u16(y_vec_hi); + + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); 
+ let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = vzip1q_s16(r_chroma, r_chroma); + let r_dup_hi = vzip2q_s16(r_chroma, r_chroma); + let g_dup_lo = vzip1q_s16(g_chroma, g_chroma); + let g_dup_hi = vzip2q_s16(g_chroma, g_chroma); + let b_dup_lo = vzip1q_s16(b_chroma, b_chroma); + let b_dup_hi = vzip2q_s16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_lo, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); + + let r_lo = clamp_u10(vqaddq_s16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10(vqaddq_s16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10(vqaddq_s16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10(vqaddq_s16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10(vqaddq_s16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10(vqaddq_s16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + + x += 16; + } + + if x < width { + scalar::p010_to_rgb_u16_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + /// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over the /// shared [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. 
            }
        }
    }

    // ---- P010 NEON scalar-equivalence --------------------------------------

    /// P010 test samples: 10‑bit values shifted into the high 10 bits
    /// (`value << 6`). Deterministic pseudo‑random generator keyed by
    /// index × seed so U, V, Y vectors are mutually distinct.
    fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
        (0..n)
            .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6)
            .collect()
    }

    /// Interleaves per‑pair U, V samples into P010's semi‑planar UV
    /// layout: `[U0, V0, U1, V1, …]`.
    fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> {
        let pairs = u.len();
        debug_assert_eq!(u.len(), v.len());
        let mut out = std::vec::Vec::with_capacity(pairs * 2);
        for i in 0..pairs {
            out.push(u[i]);
            out.push(v[i]);
        }
        out
    }

    // Runs scalar and NEON u8 kernels on identical input and pinpoints the
    // first diverging byte on mismatch.
    fn check_p010_u8_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        let y = p010_plane(width, 37);
        let u_plane = p010_plane(width / 2, 53);
        let v_plane = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u_plane, &v_plane);
        let mut rgb_scalar = std::vec![0u8; width * 3];
        let mut rgb_neon = std::vec![0u8; width * 3];

        scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: this test module only compiles/runs on NEON-capable hosts.
        unsafe {
            p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range);
        }
        if rgb_scalar != rgb_neon {
            let diff = rgb_scalar
                .iter()
                .zip(rgb_neon.iter())
                .position(|(a, b)| a != b)
                .unwrap();
            panic!(
                "NEON P010→u8 diverges at byte {diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
                rgb_scalar[diff], rgb_neon[diff]
            );
        }
    }

    // Same as above for the native-depth u16 output path.
    fn check_p010_u16_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        let y = p010_plane(width, 37);
        let u_plane = p010_plane(width / 2, 53);
        let v_plane = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u_plane, &v_plane);
        let mut rgb_scalar = std::vec![0u16; width * 3];
        let mut rgb_neon = std::vec![0u16; width * 3];

        scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: this test module only compiles/runs on NEON-capable hosts.
        unsafe {
            p010_to_rgb_u16_row(&y, &uv, &mut rgb_neon, width, matrix, full_range);
        }
        if rgb_scalar != rgb_neon {
            let diff = rgb_scalar
                .iter()
                .zip(rgb_neon.iter())
                .position(|(a, b)| a != b)
                .unwrap();
            panic!(
                "NEON P010→u16 diverges at elem {diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
                rgb_scalar[diff], rgb_neon[diff]
            );
        }
    }

    #[test]
    fn neon_p010_u8_matches_scalar_all_matrices_16() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u8_equivalence(16, m, full);
            }
        }
    }

    #[test]
    fn neon_p010_u16_matches_scalar_all_matrices_16() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u16_equivalence(16, m, full);
            }
        }
    }

    // Widths that are even but not multiples of the 16-pixel SIMD block,
    // so the scalar tail path is exercised.
    #[test]
    fn neon_p010_matches_scalar_odd_tail_widths() {
        for w in [18usize, 30, 34, 1922] {
            check_p010_u8_equivalence(w, ColorMatrix::Bt601, false);
            check_p010_u16_equivalence(w, ColorMatrix::Bt709, true);
        }
    }

    #[test]
    fn neon_p010_matches_scalar_1920() {
        check_p010_u8_equivalence(1920, ColorMatrix::Bt709, false);
        check_p010_u16_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }

    /// Adversarial regression: mispacked input — `yuv420p10le` values
    /// (10 bits in low 10) accidentally handed to the P010 kernel, or
    /// arbitrary bit corruption — must still produce bit‑identical
    /// output on scalar and NEON. The kernel's `>> 6` load extracts
    /// only the high 10 bits, so any low‑6‑bits data gets deterministically
    /// discarded in both paths.
    #[test]
    fn neon_p010_matches_scalar_on_mispacked_input() {
        let width = 32;

        // Three input variants:
        // - `yuv420p10le_style`: values in low 10 bits (wrong packing
        //   for P010 — `>> 6` drops the actual data, producing near‑black).
        // - `noise`: arbitrary 16‑bit noise, no particular pattern.
        // - `every_bit`: each sample has every bit set (0xFFFF).
        for variant in ["yuv420p10le_style", "noise", "every_bit"] {
            let y: std::vec::Vec<u16> = match variant {
                "every_bit" => std::vec![0xFFFFu16; width],
                "yuv420p10le_style" => (0..width).map(|i| ((i * 37 + 11) & 0x3FF) as u16).collect(),
                _ => (0..width)
                    .map(|i| ((i as u32 * 53 + 0xDEAD) as u16) ^ 0xA5A5)
                    .collect(),
            };
            let uv: std::vec::Vec<u16> = match variant {
                "every_bit" => std::vec![0xFFFFu16; width],
                "yuv420p10le_style" => (0..width).map(|i| ((i * 71 + 23) & 0x3FF) as u16).collect(),
                _ => (0..width)
                    .map(|i| ((i as u32 * 91 + 0xBEEF) as u16) ^ 0x5A5A)
                    .collect(),
            };

            for matrix in [ColorMatrix::Bt601, ColorMatrix::Bt709, ColorMatrix::YCgCo] {
                for full_range in [true, false] {
                    let mut rgb_scalar = std::vec![0u8; width * 3];
                    let mut rgb_neon = std::vec![0u8; width * 3];
                    scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
                    // SAFETY: NEON-capable host (see module gate).
                    unsafe {
                        p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range);
                    }
                    assert_eq!(
                        rgb_scalar, rgb_neon,
                        "scalar and NEON diverge on {variant} P010 input (matrix={matrix:?}, full_range={full_range})"
                    );

                    let mut rgb16_scalar = std::vec![0u16; width * 3];
                    let mut rgb16_neon = std::vec![0u16; width * 3];
                    scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb16_scalar, width, matrix, full_range);
                    // SAFETY: NEON-capable host (see module gate).
                    unsafe {
                        p010_to_rgb_u16_row(&y, &uv, &mut rgb16_neon, width, matrix, full_range);
                    }
                    assert_eq!(
                        rgb16_scalar, rgb16_neon,
                        "scalar and NEON diverge on {variant} P010 u16 output (matrix={matrix:?}, full_range={full_range})"
                    );
                }
            }
        }
    }
}
b/src/row/arch/wasm_simd128.rs index 8b0175c..21efa7c 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -41,8 +41,8 @@ use core::arch::wasm32::{ i16x8_narrow_i32x4, i16x8_splat, i16x8_sub, i32x4_add, i32x4_extend_high_i16x8, i32x4_extend_low_i16x8, i32x4_mul, i32x4_shr, i32x4_splat, i32x4_trunc_sat_f32x4, u8x16_narrow_i16x8, u8x16_swizzle, u16x8_extend_high_u8x16, u16x8_extend_low_u8x16, - u16x8_load_extend_u8x8, u16x8_splat, u32x4_extend_high_u16x8, u32x4_extend_low_u16x8, v128, - v128_and, v128_bitselect, v128_load, v128_or, v128_store, + u16x8_load_extend_u8x8, u16x8_shr, u16x8_splat, u32x4_extend_high_u16x8, u32x4_extend_low_u16x8, + v128, v128_and, v128_bitselect, v128_load, v128_or, v128_store, }; use crate::{ColorMatrix, row::scalar}; @@ -500,6 +500,269 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { } } +/// WASM simd128 P010 → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. Mirrors +/// [`yuv420p10_to_rgb_row`] with two structural differences: +/// - Samples are shifted right by 6 (`u16x8_shr(_, 6)`) instead of +/// AND‑masked. +/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_wasm`] +/// (two `u8x16_swizzle` + two `i8x16_shuffle` combines). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. 
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn p010_to_rgb_row(
    y: &[u16],
    uv_half: &[u16],
    rgb_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(uv_half.len() >= width);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10-bit input scaled straight to 8-bit output depth.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
    let bias = scalar::chroma_bias::<10>();
    // Q15 round-to-nearest constant.
    const RND: i32 = 1 << 14;

    // SAFETY: simd128 compile‑time availability is the caller's
    // obligation.
    unsafe {
        let rnd_v = i32x4_splat(RND);
        let y_off_v = i16x8_splat(y_off as i16);
        let y_scale_v = i32x4_splat(y_scale);
        let c_scale_v = i32x4_splat(c_scale);
        let bias_v = i16x8_splat(bias as i16);
        let cru = i32x4_splat(coeffs.r_u());
        let crv = i32x4_splat(coeffs.r_v());
        let cgu = i32x4_splat(coeffs.g_u());
        let cgv = i32x4_splat(coeffs.g_v());
        let cbu = i32x4_splat(coeffs.b_u());
        let cbv = i32x4_splat(coeffs.b_v());

        let mut x = 0usize;
        // 16 Y pixels / 8 UV pairs per iteration.
        while x + 16 <= width {
            // `>> 6` drops P010's zero-padded low bits, leaving 10-bit values.
            let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6);
            let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6);
            let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x));
            let u_vec = u16x8_shr(u_vec, 6);
            let v_vec = u16x8_shr(v_vec, 6);

            let u_i16 = i16x8_sub(u_vec, bias_v);
            let v_i16 = i16x8_sub(v_vec, bias_v);

            let u_lo_i32 = i32x4_extend_low_i16x8(u_i16);
            let u_hi_i32 = i32x4_extend_high_i16x8(u_i16);
            let v_lo_i32 = i32x4_extend_low_i16x8(v_i16);
            let v_hi_i32 = i32x4_extend_high_i16x8(v_i16);

            // Range-expand chroma in Q15 with rounding.
            let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v));
            let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v));
            let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v));
            let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v));

            let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);

            // Duplicate half-rate chroma across each pixel pair (4:2:0).
            let r_dup_lo = dup_lo(r_chroma);
            let r_dup_hi = dup_hi(r_chroma);
            let g_dup_lo = dup_lo(g_chroma);
            let g_dup_hi = dup_hi(g_chroma);
            let b_dup_lo = dup_lo(b_chroma);
            let b_dup_hi = dup_hi(b_chroma);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v);

            let b_lo = i16x8_add_sat(y_scaled_lo, b_dup_lo);
            let b_hi = i16x8_add_sat(y_scaled_hi, b_dup_hi);
            let g_lo = i16x8_add_sat(y_scaled_lo, g_dup_lo);
            let g_hi = i16x8_add_sat(y_scaled_hi, g_dup_hi);
            let r_lo = i16x8_add_sat(y_scaled_lo, r_dup_lo);
            let r_hi = i16x8_add_sat(y_scaled_hi, r_dup_hi);

            // Saturating i16 → u8 narrow handles the [0, 255] clamp.
            let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi);
            let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi);
            let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi);

            write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));

            x += 16;
        }

        // Tail: scalar fallback; `x` is even so the UV slice stays aligned.
        if x < width {
            scalar::p010_to_rgb_row(
                &y[x..width],
                &uv_half[x..width],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// WASM simd128 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed
/// `yuv420p10le` convention).
///
/// # Numerical contract
///
/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`].
///
/// # Safety
///
/// 1. **simd128 must be enabled at compile time.**
/// 2. `width & 1 == 0`.
/// 3. `y.len() >= width`, `uv_half.len() >= width`,
///    `rgb_out.len() >= 3 * width`.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn p010_to_rgb_u16_row(
    y: &[u16],
    uv_half: &[u16],
    rgb_out: &mut [u16],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(uv_half.len() >= width);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10-bit in → 10-bit out: no depth change, only range expansion.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range);
    let bias = scalar::chroma_bias::<10>();
    // Q15 round-to-nearest constant.
    const RND: i32 = 1 << 14;
    const OUT_MAX_10: i16 = 1023;

    // SAFETY: simd128 compile‑time availability is the caller's
    // obligation.
    unsafe {
        let rnd_v = i32x4_splat(RND);
        let y_off_v = i16x8_splat(y_off as i16);
        let y_scale_v = i32x4_splat(y_scale);
        let c_scale_v = i32x4_splat(c_scale);
        let bias_v = i16x8_splat(bias as i16);
        let max_v = i16x8_splat(OUT_MAX_10);
        let zero_v = i16x8_splat(0);
        let cru = i32x4_splat(coeffs.r_u());
        let crv = i32x4_splat(coeffs.r_v());
        let cgu = i32x4_splat(coeffs.g_u());
        let cgv = i32x4_splat(coeffs.g_v());
        let cbu = i32x4_splat(coeffs.b_u());
        let cbv = i32x4_splat(coeffs.b_v());

        let mut x = 0usize;
        // 16 Y pixels / 8 UV pairs per iteration.
        while x + 16 <= width {
            // `>> 6` recovers 10-bit values from P010's high-bit packing.
            let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6);
            let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6);
            let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x));
            let u_vec = u16x8_shr(u_vec, 6);
            let v_vec = u16x8_shr(v_vec, 6);

            let u_i16 = i16x8_sub(u_vec, bias_v);
            let v_i16 = i16x8_sub(v_vec, bias_v);

            let u_lo_i32 = i32x4_extend_low_i16x8(u_i16);
            let u_hi_i32 = i32x4_extend_high_i16x8(u_i16);
            let v_lo_i32 = i32x4_extend_low_i16x8(v_i16);
            let v_hi_i32 = i32x4_extend_high_i16x8(v_i16);

            // Range-expand chroma in Q15 with rounding.
            let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v));
            let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v));
            let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v));
            let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v));

            let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);

            // Duplicate half-rate chroma across each pixel pair (4:2:0).
            let r_dup_lo = dup_lo(r_chroma);
            let r_dup_hi = dup_hi(r_chroma);
            let g_dup_lo = dup_lo(g_chroma);
            let g_dup_hi = dup_hi(g_chroma);
            let b_dup_lo = dup_lo(b_chroma);
            let b_dup_hi = dup_hi(b_chroma);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v);

            // Saturating add, then explicit clamp into [0, 1023].
            let r_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, r_dup_lo), zero_v, max_v);
            let r_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, r_dup_hi), zero_v, max_v);
            let g_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, g_dup_lo), zero_v, max_v);
            let g_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, g_dup_hi), zero_v, max_v);
            let b_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, b_dup_lo), zero_v, max_v);
            let b_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, b_dup_hi), zero_v, max_v);

            // Each 8-pixel half writes 24 u16 elements.
            let dst = rgb_out.as_mut_ptr().add(x * 3);
            write_rgb_u16_8(r_lo, g_lo, b_lo, dst);
            write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24));

            x += 16;
        }

        // Tail: scalar fallback; `x` is even so the UV slice stays aligned.
        if x < width {
            scalar::p010_to_rgb_u16_row(
                &y[x..width],
                &uv_half[x..width],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// Deinterleaves 16 `u16` elements at `ptr` into `(u_vec, v_vec)` —
/// two 128‑bit vectors each holding 8 `u16` samples. Wasm's
/// `u8x16_swizzle` is semantically equivalent to SSSE3
/// `_mm_shuffle_epi8` (indices ≥ 16 zero the lane), so the same
/// split‑mask pattern applies. `i8x16_shuffle` is used for the
/// cross‑vector 64‑bit recombine.
///
/// # Safety
///
/// `ptr` must point to at least 32 readable bytes (16 `u16`
/// elements). Caller must have simd128 enabled at compile time.
#[inline(always)]
unsafe fn deinterleave_uv_u16_wasm(ptr: *const u16) -> (v128, v128) {
    unsafe {
        // Pack evens (U's) into low 8 bytes, odds (V's) into high 8 bytes.
        // Byte pairs (0,1), (4,5), … select the even u16 lanes; (2,3),
        // (6,7), … select the odd ones.
        let split_mask = i8x16(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);

        let chunk0 = v128_load(ptr.cast());
        let chunk1 = v128_load(ptr.add(8).cast());

        let s0 = u8x16_swizzle(chunk0, split_mask);
        let s1 = u8x16_swizzle(chunk1, split_mask);

        // u_vec = low 8 bytes of s0 + low 8 bytes of s1.
        // v_vec = high 8 bytes of s0 + high 8 bytes of s1.
        let u_vec = i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(s0, s1);
        let v_vec =
            i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(s0, s1);
        (u_vec, v_vec)
    }
}

/// WASM simd128 NV12 → packed RGB (UV-ordered chroma). Thin wrapper
/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`.
        check_p10_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false);
        check_p10_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }

    // ---- P010 simd128 scalar-equivalence --------------------------------

    // Deterministic P010 plane: 10-bit values packed into the high 10 bits
    // (`value << 6`), keyed by index × seed.
    fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
        (0..n)
            .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6)
            .collect()
    }

    // Interleaves per-pair U, V into P010's semi-planar [U0, V0, U1, V1, …].
    fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> {
        let pairs = u.len();
        debug_assert_eq!(u.len(), v.len());
        let mut out = std::vec::Vec::with_capacity(pairs * 2);
        for i in 0..pairs {
            out.push(u[i]);
            out.push(v[i]);
        }
        out
    }

    // Asserts the simd128 u8 kernel is byte-identical to the scalar kernel.
    fn check_p010_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        let y = p010_plane(width, 37);
        let u = p010_plane(width / 2, 53);
        let v = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u, &v);
        let mut rgb_scalar = std::vec![0u8; width * 3];
        let mut rgb_simd = std::vec![0u8; width * 3];
        scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: simd128 is a compile-time feature of the wasm test build.
        unsafe {
            p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range);
        }
        assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u8 diverges");
    }

    // Same assertion for the native-depth u16 output path.
    fn check_p010_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        let y = p010_plane(width, 37);
        let u = p010_plane(width / 2, 53);
        let v = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u, &v);
        let mut rgb_scalar = std::vec![0u16; width * 3];
        let mut rgb_simd = std::vec![0u16; width * 3];
        scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: simd128 is a compile-time feature of the wasm test build.
        unsafe {
            p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range);
        }
        assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u16 diverges");
    }

    #[test]
    fn simd128_p010_u8_matches_scalar_all_matrices() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u8_simd128_equivalence(16, m, full);
            }
        }
    }

    #[test]
    fn simd128_p010_u16_matches_scalar_all_matrices() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u16_simd128_equivalence(16, m, full);
            }
        }
    }

    // Even widths that are not multiples of the 16-pixel block → tail path.
    #[test]
    fn simd128_p010_matches_scalar_tail_widths() {
        for w in [18usize, 30, 34, 1922] {
            check_p010_u8_simd128_equivalence(w, ColorMatrix::Bt601, false);
            check_p010_u16_simd128_equivalence(w, ColorMatrix::Bt709, true);
        }
    }

    #[test]
    fn simd128_p010_matches_scalar_1920() {
        check_p010_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false);
        check_p010_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }
}

    _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256,
    _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, _mm256_mullo_epi32, _mm256_packs_epi32,
    _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set1_epi16,
    _mm256_set1_epi32, _mm256_setr_epi8, _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_srli_epi16,
    _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpacklo_epi16,
};

use crate::{
    unsafe { _mm256_min_epi16(_mm256_max_epi16(v, zero_v), max_v) }
}

/// AVX2 P010 → packed **8‑bit** RGB.
///
/// Block size 32 Y pixels / 16 chroma pairs per iteration.
/// Mirrors [`yuv420p10_to_rgb_row`] with two structural differences:
/// - Samples are shifted right by 6 (`_mm256_srli_epi16::<6>`)
///   instead of AND‑masked.
/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx2`]
///   (two `_mm256_shuffle_epi8` + two `_mm256_permute4x64_epi64` +
///   two `_mm256_permute2x128_si256` per 32 chroma elements).
///
/// # Numerical contract
///
/// Byte‑identical to [`scalar::p010_to_rgb_row`].
///
/// # Safety
///
/// 1. **AVX2 must be available on the current CPU.**
/// 2. `width & 1 == 0`.
/// 3. `y.len() >= width`, `uv_half.len() >= width`,
///    `rgb_out.len() >= 3 * width`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn p010_to_rgb_row(
    y: &[u16],
    uv_half: &[u16],
    rgb_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(uv_half.len() >= width);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10-bit input scaled straight to 8-bit output depth.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
    let bias = scalar::chroma_bias::<10>();
    // Q15 round-to-nearest constant.
    const RND: i32 = 1 << 14;

    // SAFETY: AVX2 availability is the caller's obligation.
    unsafe {
        let rnd_v = _mm256_set1_epi32(RND);
        let y_off_v = _mm256_set1_epi16(y_off as i16);
        let y_scale_v = _mm256_set1_epi32(y_scale);
        let c_scale_v = _mm256_set1_epi32(c_scale);
        let bias_v = _mm256_set1_epi16(bias as i16);
        let cru = _mm256_set1_epi32(coeffs.r_u());
        let crv = _mm256_set1_epi32(coeffs.r_v());
        let cgu = _mm256_set1_epi32(coeffs.g_u());
        let cgv = _mm256_set1_epi32(coeffs.g_v());
        let cbu = _mm256_set1_epi32(coeffs.b_u());
        let cbv = _mm256_set1_epi32(coeffs.b_v());

        let mut x = 0usize;
        while x + 32 <= width {
            // 32 Y = two u16×16 loads, shifted right by 6.
            let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast()));
            let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()));

            // 32 UV (16 pairs) — deinterleave + shift.
            let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x));
            let u_vec = _mm256_srli_epi16::<6>(u_vec);
            let v_vec = _mm256_srli_epi16::<6>(v_vec);

            let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
            let v_i16 = _mm256_sub_epi16(v_vec, bias_v);

            let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
            let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16));
            let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
            let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16));

            // Range-expand chroma in Q15 with rounding.
            let u_d_lo = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(u_lo_i32, c_scale_v),
                rnd_v,
            ));
            let u_d_hi = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(u_hi_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_lo = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(v_lo_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_hi = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(v_hi_i32, c_scale_v),
                rnd_v,
            ));

            let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);

            // Duplicate half-rate chroma across each pixel pair (4:2:0).
            let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma);
            let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma);
            let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v);

            let b_lo = _mm256_adds_epi16(y_scaled_lo, b_dup_lo);
            let b_hi = _mm256_adds_epi16(y_scaled_hi, b_dup_hi);
            let g_lo = _mm256_adds_epi16(y_scaled_lo, g_dup_lo);
            let g_hi = _mm256_adds_epi16(y_scaled_hi, g_dup_hi);
            let r_lo = _mm256_adds_epi16(y_scaled_lo, r_dup_lo);
            let r_hi = _mm256_adds_epi16(y_scaled_hi, r_dup_hi);

            // Saturating narrow handles the [0, 255] clamp.
            let b_u8 = narrow_u8x32(b_lo, b_hi);
            let g_u8 = narrow_u8x32(g_lo, g_hi);
            let r_u8 = narrow_u8x32(r_lo, r_hi);

            write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));

            x += 32;
        }

        // Tail: scalar fallback; `x` is even so the UV slice stays aligned.
        if x < width {
            scalar::p010_to_rgb_row(
                &y[x..width],
                &uv_half[x..width],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// AVX2 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed
/// `yuv420p10le` convention).
///
/// # Numerical contract
///
/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`].
///
/// # Safety
///
/// 1. **AVX2 must be available on the current CPU.**
/// 2. `width & 1 == 0`.
/// 3. `y.len() >= width`, `uv_half.len() >= width`,
///    `rgb_out.len() >= 3 * width`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn p010_to_rgb_u16_row(
    y: &[u16],
    uv_half: &[u16],
    rgb_out: &mut [u16],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(uv_half.len() >= width);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10-bit in → 10-bit out: no depth change, only range expansion.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range);
    let bias = scalar::chroma_bias::<10>();
    // Q15 round-to-nearest constant.
    const RND: i32 = 1 << 14;
    const OUT_MAX_10: i16 = 1023;

    // SAFETY: AVX2 availability is the caller's obligation.
    unsafe {
        let rnd_v = _mm256_set1_epi32(RND);
        let y_off_v = _mm256_set1_epi16(y_off as i16);
        let y_scale_v = _mm256_set1_epi32(y_scale);
        let c_scale_v = _mm256_set1_epi32(c_scale);
        let bias_v = _mm256_set1_epi16(bias as i16);
        let max_v = _mm256_set1_epi16(OUT_MAX_10);
        let zero_v = _mm256_set1_epi16(0);
        let cru = _mm256_set1_epi32(coeffs.r_u());
        let crv = _mm256_set1_epi32(coeffs.r_v());
        let cgu = _mm256_set1_epi32(coeffs.g_u());
        let cgv = _mm256_set1_epi32(coeffs.g_v());
        let cbu = _mm256_set1_epi32(coeffs.b_u());
        let cbv = _mm256_set1_epi32(coeffs.b_v());

        let mut x = 0usize;
        while x + 32 <= width {
            // `>> 6` recovers 10-bit values from P010's high-bit packing.
            let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast()));
            let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()));
            let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x));
            let u_vec = _mm256_srli_epi16::<6>(u_vec);
            let v_vec = _mm256_srli_epi16::<6>(v_vec);

            let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
            let v_i16 = _mm256_sub_epi16(v_vec, bias_v);

            let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
            let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16));
            let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
            let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16));

            // Range-expand chroma in Q15 with rounding.
            let u_d_lo = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(u_lo_i32, c_scale_v),
                rnd_v,
            ));
            let u_d_hi = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(u_hi_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_lo = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(v_lo_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_hi = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(v_hi_i32, c_scale_v),
                rnd_v,
            ));

            let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);

            // Duplicate half-rate chroma across each pixel pair (4:2:0).
            let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma);
            let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma);
            let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v);

            // Saturating add, then clamp into [0, 1023].
            let r_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v);
            let r_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v);
            let g_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v);
            let g_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v);
            let b_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v);
            let b_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v);

            // Four 8-pixel stores, 24 u16 elements apart.
            let dst = rgb_out.as_mut_ptr().add(x * 3);
            write_rgb_u16_8(
                _mm256_castsi256_si128(r_lo),
                _mm256_castsi256_si128(g_lo),
                _mm256_castsi256_si128(b_lo),
                dst,
            );
            write_rgb_u16_8(
                _mm256_extracti128_si256::<1>(r_lo),
                _mm256_extracti128_si256::<1>(g_lo),
                _mm256_extracti128_si256::<1>(b_lo),
                dst.add(24),
            );
            write_rgb_u16_8(
                _mm256_castsi256_si128(r_hi),
                _mm256_castsi256_si128(g_hi),
                _mm256_castsi256_si128(b_hi),
                dst.add(48),
            );
            write_rgb_u16_8(
                _mm256_extracti128_si256::<1>(r_hi),
                _mm256_extracti128_si256::<1>(g_hi),
                _mm256_extracti128_si256::<1>(b_hi),
                dst.add(72),
            );

            x += 32;
        }

        // Tail: scalar fallback; `x` is even so the UV slice stays aligned.
        if x < width {
            scalar::p010_to_rgb_u16_row(
                &y[x..width],
                &uv_half[x..width],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// Deinterleaves 32 `u16` elements at `ptr` (`[U0, V0, U1, V1, …,
/// U15, V15]`) into `(u_vec, v_vec)` — two AVX2 vectors each holding
/// 16 packed `u16` samples.
///
/// Uses per‑lane `_mm256_shuffle_epi8` to pack each 128‑bit lane's
/// U/V samples into the low/high 64 bits, then
/// `_mm256_permute4x64_epi64::<0xD8>` to move the two U halves
/// together (low 128) and the two V halves together (high 128) within
/// each source vector, and finally `_mm256_permute2x128_si256` to
/// combine the four U halves and the four V halves across the two
/// vectors. 2 loads + 2 shuffles + 2 per-vector permutes + 2 cross-
/// vector permutes = 8 ops.
///
/// # Safety
///
/// `ptr` must point to at least 64 readable bytes (32 `u16`
/// elements). Caller's `target_feature` must include AVX2.
#[inline(always)]
unsafe fn deinterleave_uv_u16_avx2(ptr: *const u16) -> (__m256i, __m256i) {
    unsafe {
        // Per‑lane byte mask: within each 128‑bit lane, pack even u16s
        // (U's) into low 8 bytes, odd u16s (V's) into high 8 bytes.
        let split_mask = _mm256_setr_epi8(
            0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, // low lane
            0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, // high lane
        );

        let uv0 = _mm256_loadu_si256(ptr.cast());
        let uv1 = _mm256_loadu_si256(ptr.add(16).cast());

        // After per‑lane shuffle: each vector is
        // `[U_lane0_lo, V_lane0_lo, U_lane1_lo, V_lane1_lo]` in 64‑bit
        // chunks.
        let s0 = _mm256_shuffle_epi8(uv0, split_mask);
        let s1 = _mm256_shuffle_epi8(uv1, split_mask);

        // Permute 4×64 within each vector to get [U0..U7, V0..V7] and
        // [U8..U15, V8..V15]. Mask 0xD8 = (3,1,2,0) → picks 64-bit
        // chunks 0, 2, 1, 3 from the source, rearranging
        // [A, B, C, D] → [A, C, B, D].
        let s0_p = _mm256_permute4x64_epi64::<0xD8>(s0);
        let s1_p = _mm256_permute4x64_epi64::<0xD8>(s1);

        // Cross-vector permute: low 128 of s0_p + low 128 of s1_p → U's;
        // high 128 of s0_p + high 128 of s1_p → V's.
        let u_vec = _mm256_permute2x128_si256::<0x20>(s0_p, s1_p);
        let v_vec = _mm256_permute2x128_si256::<0x31>(s0_p, s1_p);
        (u_vec, v_vec)
    }
}

/// AVX2 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over
/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`.
        check_p10_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false);
        check_p10_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }

    // ---- P010 AVX2 scalar-equivalence -----------------------------------

    // Deterministic P010 plane: 10-bit values packed into the high 10 bits.
    fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
        (0..n)
            .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6)
            .collect()
    }

    // Interleaves per-pair U, V into P010's semi-planar [U0, V0, U1, V1, …].
    fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> {
        let pairs = u.len();
        debug_assert_eq!(u.len(), v.len());
        let mut out = std::vec::Vec::with_capacity(pairs * 2);
        for i in 0..pairs {
            out.push(u[i]);
            out.push(v[i]);
        }
        out
    }

    // Asserts the AVX2 u8 kernel is byte-identical to the scalar kernel.
    // Skips silently on hosts without AVX2.
    fn check_p010_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        let y = p010_plane(width, 37);
        let u = p010_plane(width / 2, 53);
        let v = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u, &v);
        let mut rgb_scalar = std::vec![0u8; width * 3];
        let mut rgb_simd = std::vec![0u8; width * 3];
        scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: AVX2 presence checked above.
        unsafe {
            p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range);
        }
        assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u8 diverges");
    }

    // Same assertion for the native-depth u16 output path.
    fn check_p010_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        let y = p010_plane(width, 37);
        let u = p010_plane(width / 2, 53);
        let v = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u, &v);
        let mut rgb_scalar = std::vec![0u16; width * 3];
        let mut rgb_simd = std::vec![0u16; width * 3];
        scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: AVX2 presence checked above.
        unsafe {
            p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range);
        }
        assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u16 diverges");
    }

    #[test]
    fn avx2_p010_u8_matches_scalar_all_matrices() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u8_avx2_equivalence(32, m, full);
            }
        }
    }

    #[test]
    fn avx2_p010_u16_matches_scalar_all_matrices() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u16_avx2_equivalence(32, m, full);
            }
        }
    }

    // Even widths that are not multiples of the 32-pixel block → tail path.
    #[test]
    fn avx2_p010_matches_scalar_odd_tail_widths() {
        for w in [34usize, 62, 66, 1922] {
            check_p010_u8_avx2_equivalence(w, ColorMatrix::Bt601, false);
            check_p010_u16_avx2_equivalence(w, ColorMatrix::Bt709, true);
        }
    }

    #[test]
    fn avx2_p010_matches_scalar_1920() {
        check_p010_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false);
        check_p010_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }
}

    _mm512_extracti64x4_epi64, _mm512_loadu_si512, _mm512_max_epi16, _mm512_min_epi16,
    _mm512_mullo_epi32, _mm512_packs_epi32, _mm512_packus_epi16, _mm512_permutex2var_epi64,
    _mm512_permutexvar_epi64, _mm512_set1_epi16, _mm512_set1_epi32, _mm512_setr_epi64,
    _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_srli_epi16, _mm512_sub_epi16,
    _mm512_unpackhi_epi16, _mm512_unpacklo_epi16,
};

use crate::{
@@ -572,6 +572,317 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u } } +/// AVX‑512 P010 → packed **8‑bit** RGB. +/// +/// Block size 64 Y pixels / 32 chroma pairs per iteration. Mirrors +/// [`yuv420p10_to_rgb_row`] with two structural differences: +/// - Samples are shifted right by 6 (`_mm512_srli_epi16::<6>`) +/// instead of AND‑masked. +/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx512`] +/// — per‑128‑lane shuffle + 64‑bit permute + cross‑vector +/// `_mm512_permutex2var_epi64` to produce 32‑sample U and V +/// vectors. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// +/// # Safety +/// +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: AVX‑512BW availability is the caller's obligation. 
+ unsafe { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + let mut x = 0usize; + while x + 64 <= width { + let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); + let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); + let u_vec = _mm512_srli_epi16::<6>(u_vec); + let v_vec = _mm512_srli_epi16::<6>(v_vec); + + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + 
let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + let b_lo = _mm512_adds_epi16(y_scaled_lo, b_dup_lo); + let b_hi = _mm512_adds_epi16(y_scaled_hi, b_dup_hi); + let g_lo = _mm512_adds_epi16(y_scaled_lo, g_dup_lo); + let g_hi = _mm512_adds_epi16(y_scaled_hi, g_dup_hi); + let r_lo = _mm512_adds_epi16(y_scaled_lo, r_dup_lo); + let r_hi = _mm512_adds_epi16(y_scaled_hi, r_dup_hi); + + let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup); + let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); + let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); + + write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + x += 64; + } + + if x < width { + scalar::p010_to_rgb_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// AVX‑512 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed +/// `yuv420p10le` convention). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// +/// # Safety +/// +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: AVX‑512BW availability is the caller's obligation. + unsafe { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + let max_v = _mm512_set1_epi16(OUT_MAX_10); + let zero_v = _mm512_set1_epi16(0); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + let mut x = 0usize; + while x + 64 <= width { + let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); + let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); + let u_vec = _mm512_srli_epi16::<6>(u_vec); + let v_vec = _mm512_srli_epi16::<6>(v_vec); + + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = 
_mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + let r_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + let dst = rgb_out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, 
dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + + x += 64; + } + + if x < width { + scalar::p010_to_rgb_u16_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Deinterleaves 64 `u16` elements at `ptr` into `(u_vec, v_vec)` — +/// two AVX‑512 vectors each holding 32 packed `u16` samples. +/// +/// Per‑128‑bit‑lane `_mm512_shuffle_epi8` packs even u16s (U's) into +/// each lane's low 64 bits, odd u16s (V's) into the high 64. Then +/// `_mm512_permutexvar_epi64` with the existing `pack_fixup` index +/// `[0, 2, 4, 6, 1, 3, 5, 7]` rearranges the 64‑bit chunks so each +/// vector becomes `[U0..U15 | V0..V15]`. Finally +/// `_mm512_permutex2var_epi64` combines the two vectors into the +/// full 32‑sample U and V vectors. +/// +/// # Safety +/// +/// `ptr` must point to at least 128 readable bytes (64 `u16` +/// elements). Caller's `target_feature` must include AVX‑512F + +/// AVX‑512BW. +#[inline(always)] +unsafe fn deinterleave_uv_u16_avx512(ptr: *const u16) -> (__m512i, __m512i) { + unsafe { + // Per‑128‑lane mask (same byte pattern replicated across the 4 + // lanes of a `__m512i`). 
+ let split_mask = _mm512_broadcast_i32x4(_mm_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + )); + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + // Cross-vector 2x8 permute indices: + // u_vec = low 256 of each vec → chunks [0..3 of a, 0..3 of b] + // v_vec = high 256 of each vec → chunks [4..7 of a, 4..7 of b] + let u_perm = _mm512_setr_epi64(0, 1, 2, 3, 8, 9, 10, 11); + let v_perm = _mm512_setr_epi64(4, 5, 6, 7, 12, 13, 14, 15); + + let uv0 = _mm512_loadu_si512(ptr.cast()); + let uv1 = _mm512_loadu_si512(ptr.add(32).cast()); + + let s0 = _mm512_shuffle_epi8(uv0, split_mask); + let s1 = _mm512_shuffle_epi8(uv1, split_mask); + + // After per-lane shuffle + per-vector 64-bit permute, each vector + // is `[U0..U15 | V0..V15]` (low 256 = U's, high 256 = V's). + let s0_p = _mm512_permutexvar_epi64(pack_fixup, s0); + let s1_p = _mm512_permutexvar_epi64(pack_fixup, s1); + + let u_vec = _mm512_permutex2var_epi64(s0_p, u_perm, s1_p); + let v_vec = _mm512_permutex2var_epi64(s0_p, v_perm, s1_p); + (u_vec, v_vec) + } +} + /// AVX‑512 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over /// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. 
/// @@ -1425,4 +1736,103 @@ mod tests { check_p10_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); check_p10_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- P010 AVX-512 scalar-equivalence -------------------------------- + + fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> { + (0..n) + .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6) + .collect() + } + + fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> { + let pairs = u.len(); + debug_assert_eq!(u.len(), v.len()); + let mut out = std::vec::Vec::with_capacity(pairs * 2); + for i in 0..pairs { + out.push(u[i]); + out.push(v[i]); + } + out + } + + fn check_p010_u8_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p010_plane(width, 37); + let u = p010_plane(width / 2, 53); + let v = p010_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u8 diverges"); + } + + fn check_p010_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p010_plane(width, 37); + let u = p010_plane(width / 2, 53); + let v = p010_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u16 diverges"); + } + + #[test] + fn 
avx512_p010_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p010_u8_avx512_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_p010_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p010_u16_avx512_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_p010_matches_scalar_odd_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_p010_u8_avx512_equivalence(w, ColorMatrix::Bt601, false); + check_p010_u16_avx512_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn avx512_p010_matches_scalar_1920() { + check_p010_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); + check_p010_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 297f1fb..66e385b 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -39,7 +39,8 @@ use core::arch::x86_64::{ __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_and_si128, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, _mm_mullo_epi32, _mm_packs_epi32, _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, - _mm_srai_epi32, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, _mm_unpacklo_epi16, + _mm_srai_epi32, _mm_srli_epi16, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, + _mm_unpackhi_epi64, _mm_unpacklo_epi16, _mm_unpacklo_epi64, }; use crate::{ @@ -192,6 +193,271 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// SSE4.1 P010 → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. 
Differences +/// from [`yuv420p10_to_rgb_row`]: +/// - Samples are shifted right by 6 (`_mm_srli_epi16::<6>`) instead +/// of AND‑masked — P010's 10 active bits live in the HIGH 10 of +/// each `u16`. +/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16`] +/// below (one `_mm_shuffle_epi8` + two 64‑bit unpacks per 16 +/// chroma elements). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: SSE4.1 availability is the caller's obligation. + unsafe { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // Y: two u16×8 loads, each shifted right by 6. 
+ let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); + let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + + // UV: two u16×8 loads of interleaved [U0,V0,U1,V1,...], then + // deinterleave into separate u_vec + v_vec. + let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); + let u_vec = _mm_srli_epi16::<6>(u_vec); + let v_vec = _mm_srli_epi16::<6>(v_vec); + + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = _mm_unpacklo_epi16(r_chroma, r_chroma); + let r_dup_hi = _mm_unpackhi_epi16(r_chroma, r_chroma); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma, g_chroma); + let g_dup_hi = _mm_unpackhi_epi16(g_chroma, g_chroma); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma, b_chroma); + let b_dup_hi = _mm_unpackhi_epi16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let b_lo = _mm_adds_epi16(y_scaled_lo, b_dup_lo); + let b_hi = _mm_adds_epi16(y_scaled_hi, b_dup_hi); + let g_lo = _mm_adds_epi16(y_scaled_lo, g_dup_lo); + let g_hi = 
_mm_adds_epi16(y_scaled_hi, g_dup_hi); + let r_lo = _mm_adds_epi16(y_scaled_lo, r_dup_lo); + let r_hi = _mm_adds_epi16(y_scaled_hi, r_dup_hi); + + let b_u8 = _mm_packus_epi16(b_lo, b_hi); + let g_u8 = _mm_packus_epi16(g_lo, g_hi); + let r_u8 = _mm_packus_epi16(r_lo, r_hi); + + write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + x += 16; + } + + if x < width { + scalar::p010_to_rgb_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// SSE4.1 P010 → packed **10‑bit `u16`** RGB (native‑depth, +/// low‑bit‑packed — `yuv420p10le` convention). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: SSE4.1 availability is the caller's obligation. 
+ unsafe { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let max_v = _mm_set1_epi16(OUT_MAX_10); + let zero_v = _mm_set1_epi16(0); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); + let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); + let u_vec = _mm_srli_epi16::<6>(u_vec); + let v_vec = _mm_srli_epi16::<6>(v_vec); + + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = _mm_unpacklo_epi16(r_chroma, r_chroma); + let r_dup_hi = _mm_unpackhi_epi16(r_chroma, r_chroma); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma, g_chroma); + let 
g_dup_hi = _mm_unpackhi_epi16(g_chroma, g_chroma); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma, b_chroma); + let b_dup_hi = _mm_unpackhi_epi16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let r_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + + x += 16; + } + + if x < width { + scalar::p010_to_rgb_u16_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Deinterleaves 16 `u16` elements at `ptr` (`[U0, V0, U1, V1, …, +/// U7, V7]`) into `(u_vec, v_vec)` where each vector holds 8 packed +/// `u16` samples. +/// +/// Each of the two 128‑bit loads is byte‑shuffled via +/// `_mm_shuffle_epi8` so that U samples land in the low 64 bits and +/// V samples in the high 64. Then `_mm_unpacklo_epi64` / +/// `_mm_unpackhi_epi64` combine the two halves into full u16×8 +/// vectors. 2 loads + 2 shuffles + 2 unpacks = 6 ops. +/// +/// # Safety +/// +/// `ptr` must point to at least 32 readable bytes (16 `u16` +/// elements). Caller's `target_feature` must include SSSE3 (via +/// SSE4.1 or a superset). +#[inline(always)] +unsafe fn deinterleave_uv_u16(ptr: *const u16) -> (__m128i, __m128i) { + unsafe { + // Per‑chunk mask: pack even u16s (U's) into low 8 bytes, odd u16s + // (V's) into high 8 bytes. 
+ let split_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let chunk0 = _mm_loadu_si128(ptr.cast()); + let chunk1 = _mm_loadu_si128(ptr.add(8).cast()); + let s0 = _mm_shuffle_epi8(chunk0, split_mask); + let s1 = _mm_shuffle_epi8(chunk1, split_mask); + let u_vec = _mm_unpacklo_epi64(s0, s1); + let v_vec = _mm_unpackhi_epi64(s0, s1); + (u_vec, v_vec) + } +} + /// SSE4.1 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over /// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. /// @@ -1212,4 +1478,103 @@ mod tests { check_p10_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false); check_p10_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- P010 SSE4.1 scalar-equivalence ---------------------------------- + + fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> { + (0..n) + .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6) + .collect() + } + + fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> { + let pairs = u.len(); + debug_assert_eq!(u.len(), v.len()); + let mut out = std::vec::Vec::with_capacity(pairs * 2); + for i in 0..pairs { + out.push(u[i]); + out.push(v[i]); + } + out + } + + fn check_p010_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let y = p010_plane(width, 37); + let u = p010_plane(width / 2, 53); + let v = p010_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u8 diverges"); + } + + fn check_p010_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let y = 
p010_plane(width, 37); + let u = p010_plane(width / 2, 53); + let v = p010_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u16 diverges"); + } + + #[test] + fn sse41_p010_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p010_u8_sse41_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_p010_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p010_u16_sse41_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_p010_matches_scalar_odd_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_p010_u8_sse41_equivalence(w, ColorMatrix::Bt601, false); + check_p010_u16_sse41_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn sse41_p010_matches_scalar_1920() { + check_p010_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false); + check_p010_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/mod.rs b/src/row/mod.rs index 788789b..80afab7 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -498,6 +498,158 @@ pub fn yuv420p10_to_rgb_u16_row( scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); } +/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ +/// packed — 10 active bits in the high 10 of each `u16`) to packed +/// **8‑bit** RGB. 
+/// +/// This is the HDR hardware‑decode keystone format: VideoToolbox, +/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit +/// output. See `scalar::p010_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P010** to **native‑depth `u16`** packed RGB +/// (10 active bits in the **low** 10 of each output `u16`, matching +/// `yuv420p10le` convention — **not** the P010 high‑bit packing). +/// Callers feeding this output into a P010 consumer must shift left +/// by 6. +/// +/// See `scalar::p010_to_rgb_u16_row` for the full spec. +/// `use_simd = false` forces the scalar reference. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p010_to_rgb_u16_row( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); +} + /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. /// diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 9c6afa4..527ea4d 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -344,6 +344,135 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( } } +// ---- P010 (semi-planar 10-bit, high-bit-packed) → RGB ------------------ + +/// Converts one row of P010 (semi‑planar 4:2:0 with UV interleaved, +/// 10 active bits in the **high** 10 of each `u16`) to **8‑bit** +/// packed RGB. +/// +/// Structurally identical to [`nv12_to_rgb_row`] plus the per‑sample +/// shift: each `u16` load is extracted to its 10‑bit value via +/// `sample >> 6`, then the same Q15 pipeline as +/// [`yuv_420p_n_to_rgb_row`] runs with `BITS == 10`. Mispacked input +/// — e.g. a `yuv420p10le` buffer with values in the **low** 10 bits +/// — is masked down to a small positive number (producing near‑black +/// output) rather than silent garbage, matching every SIMD backend. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0, "P010 requires even width"); + debug_assert!(y.len() >= width, "y row too short"); + debug_assert!(uv_half.len() >= width, "uv row too short"); + debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + + let coeffs = Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = range_params_n::<10, 8>(full_range); + let bias = chroma_bias::<10>(); + + // Each `u16` load is converted to its 10-bit sample with `>> 6`, + // extracting the upper 10 bits and leaving the result in + // `[0, 1023]`. If low-packed input (`yuv420p10le`) is handed to + // this kernel by mistake, that shift discards the active low 6 bits + // rather than recovering the intended 10-bit value. No hot-path + // cost: one shift per load. + let mut x = 0; + while x < width { + let c_idx = x / 2; + let u_sample = uv_half[c_idx * 2] >> 6; + let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_d = q15_scale(u_sample as i32 - bias, c_scale); + let v_d = q15_scale(v_sample as i32 - bias, c_scale); + + let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); + let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); + let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); + + let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + rgb_out[x * 3] = clamp_u8(y0 + r_chroma); + rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); + rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + + let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); + rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); + rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + + x += 2; + } +} + +/// Converts one row of P010 to **native‑depth `u16`** packed RGB +/// (10 active bits in the low bits of each `u16`, matching 
+/// `yuv420p10le` convention — **not** P010's high‑bit packing). +/// +/// Mirrors [`yuv_420p_n_to_rgb_u16_row::<10>`] on the math side; the +/// only difference is the input shift (`sample >> 6` instead of +/// `sample & 0x3FF`) and the UV deinterleave. Output is suitable for +/// direct consumption by downstream `yuv420p10le`‑shaped tooling. If +/// you need P010‑packed RGB output, shift left by 6 on the caller. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0, "P010 requires even width"); + debug_assert!(y.len() >= width, "y row too short"); + debug_assert!(uv_half.len() >= width, "uv row too short"); + debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + + let coeffs = Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = range_params_n::<10, 10>(full_range); + let bias = chroma_bias::<10>(); + let out_max: i32 = (1i32 << 10) - 1; + + let mut x = 0; + while x < width { + let c_idx = x / 2; + let u_sample = uv_half[c_idx * 2] >> 6; + let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_d = q15_scale(u_sample as i32 - bias, c_scale); + let v_d = q15_scale(v_sample as i32 - bias, c_scale); + + let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); + let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); + let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); + + let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; + rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + + let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + 
rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; + rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; + rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + + x += 2; + } +} + /// Compile‑time sample mask for `BITS`: `(1 << BITS) - 1` as `u16`. /// Returns `0x03FF` for 10‑bit, `0x0FFF` for 12‑bit, `0x3FFF` for /// 14‑bit. SIMD backends splat this into a vector constant and AND @@ -990,4 +1119,112 @@ mod tests { "matrices should materially differ: {bt709:?} vs {ycgco:?}" ); } + + // ---- p010_to_rgb_row (P010 → u8) --------------------------------------- + // + // P010 samples: 10 active bits in the HIGH 10 of each u16. + // White Y = 1023 << 6 = 0xFFC0, neutral UV = 512 << 6 = 0x8000. + + #[test] + fn p010_rgb_black_full_range() { + // Y = 0, neutral UV → black. + let y = [0u16; 4]; + let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; // U0 V0 U1 V1 + let mut rgb = [0u8; 12]; + p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); + } + + #[test] + fn p010_rgb_white_full_range() { + // Y = 0xFFC0 = 1023 << 6, neutral UV → white. + let y = [0xFFC0u16; 4]; + let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; + let mut rgb = [0u8; 12]; + p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); + } + + #[test] + fn p010_rgb_gray_is_gray() { + // 10-bit mid-gray Y=512 → P010 Y = 512 << 6 = 0x8000. + let y = [0x8000u16; 4]; + let uv = [0x8000u16; 4]; + let mut rgb = [0u8; 12]; + p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); + assert_eq!(r, g); + assert_eq!(g, b); + assert!(r.abs_diff(128) <= 1, "got {r}"); + } + } + + #[test] + fn p010_rgb_limited_range_endpoints() { + // 10-bit limited black Y=64 → P010 = 64 << 6 = 0x1000. + // 10-bit limited white Y=940 → P010 = 940 << 6 = 0xEB00. 
+ let y = [0x1000u16, 0x1000, 0xEB00, 0xEB00]; + let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; + let mut rgb = [0u8; 12]; + p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); + assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); + assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); + assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); + assert_eq!((rgb[9], rgb[10], rgb[11]), (255, 255, 255)); + } + + #[test] + fn p010_matches_yuv420p10_when_shifted() { + // Handing the same logical samples to P010 (high-packed) and + // yuv420p10 (low-packed) must produce the same RGB output. + let y_p10 = [200u16, 800, 500, 700]; // 10-bit values + let u_p10 = [600u16, 400]; // 10-bit values + let v_p10 = [300u16, 900]; // 10-bit values + + let y_p010: [u16; 4] = core::array::from_fn(|i| y_p10[i] << 6); + let uv_p010: [u16; 4] = [u_p10[0] << 6, v_p10[0] << 6, u_p10[1] << 6, v_p10[1] << 6]; + + let mut rgb_p10 = [0u8; 12]; + let mut rgb_p010 = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>( + &y_p10, + &u_p10, + &v_p10, + &mut rgb_p10, + 4, + ColorMatrix::Bt709, + true, + ); + p010_to_rgb_row( + &y_p010, + &uv_p010, + &mut rgb_p010, + 4, + ColorMatrix::Bt709, + true, + ); + assert_eq!(rgb_p10, rgb_p010); + } + + // ---- p010_to_rgb_u16_row (P010 → native-depth u16) -------------------- + + #[test] + fn p010_rgb_u16_white_full_range() { + let y = [0xFFC0u16; 4]; + let uv = [0x8000u16; 4]; + let mut rgb = [0u16; 12]; + p010_to_rgb_u16_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 1023), "got {rgb:?}"); + } + + #[test] + fn p010_rgb_u16_limited_range_endpoints() { + let y = [0x1000u16, 0xEB00]; + let uv = [0x8000u16, 0x8000]; + let mut rgb = [0u16; 6]; + p010_to_rgb_u16_row(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); + assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); + assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); + } } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 432994e..92835a5 100644 --- 
a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -19,12 +19,12 @@ use thiserror::Error; use crate::{ HsvBuffers, PixelSink, SourceFormat, row::{ - nv12_to_rgb_row, nv21_to_rgb_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv420p10_to_rgb_row, - yuv420p10_to_rgb_u16_row, + nv12_to_rgb_row, nv21_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, rgb_to_hsv_row, + yuv_420_to_rgb_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, }, yuv::{ - Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, Yuv420p, Yuv420p10, Yuv420p10Row, - Yuv420p10Sink, Yuv420pRow, Yuv420pSink, + Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, P010, P010Row, P010Sink, Yuv420p, Yuv420p10, + Yuv420p10Row, Yuv420p10Sink, Yuv420pRow, Yuv420pSink, }, }; @@ -219,6 +219,12 @@ pub enum RowSlice { /// `width / 2` elements. #[display("V Half 10")] VHalf10, + /// Half‑width interleaved UV row of a **10‑bit semi‑planar** source + /// ([`P010`]). `u16` samples, `width` elements laid out as + /// `U0, V0, U1, V1, …` (high‑bit‑packed: each element's 10 active + /// bits sit in the high 10 of its `u16`). + #[display("UV Half 10")] + UvHalf10, } /// A sink that writes any subset of `{RGB, Luma, HSV}` into @@ -1108,6 +1114,189 @@ impl PixelSink for MixedSinker<'_, Yuv420p10> { } } +// ---- P010 impl --------------------------------------------------------- + +impl<'a> MixedSinker<'a, P010> { + /// Attaches a packed **`u16`** RGB output buffer. Mirrors + /// [`MixedSinker::with_rgb_u16`] — compile‑time gated to + /// sinkers whose source format populates native‑depth RGB. + /// + /// Length is measured in `u16` **elements** (not bytes): minimum + /// `width × height × 3`. Output is **low‑bit‑packed** (10‑bit + /// values in the low 10 of each `u16`, upper 6 zero) — matches + /// FFmpeg `yuv420p10le` convention. This is **not** P010 packing + /// (which puts the 10 bits in the high 10); callers feeding a P010 + /// consumer must shift the output left by 6. 
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgb_u16(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). The
+    /// required length is measured in `u16` **elements**, not bytes.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected_elements = self.frame_bytes(3)?;
+        if buf.len() < expected_elements {
+            return Err(MixedSinkerError::RgbU16BufferTooShort {
+                expected: expected_elements,
+                actual: buf.len(),
+            });
+        }
+        self.rgb_u16 = Some(buf);
+        Ok(self)
+    }
+}
+
+impl P010Sink for MixedSinker<'_, P010> {}
+
+impl PixelSink for MixedSinker<'_, P010> {
+    type Input<'r> = P010Row<'r>;
+    type Error = MixedSinkerError;
+
+    fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> {
+        if self.width & 1 != 0 {
+            return Err(MixedSinkerError::OddWidth { width: self.width });
+        }
+        check_dimensions_match(self.width, self.height, width, height)
+    }
+
+    fn process(&mut self, row: P010Row<'_>) -> Result<(), Self::Error> {
+        let w = self.width;
+        let h = self.height;
+        let idx = row.row();
+        let use_simd = self.simd;
+
+        if w & 1 != 0 {
+            return Err(MixedSinkerError::OddWidth { width: w });
+        }
+        if row.y().len() != w {
+            return Err(MixedSinkerError::RowShapeMismatch {
+                which: RowSlice::Y10,
+                row: idx,
+                expected: w,
+                actual: row.y().len(),
+            });
+        }
+        // Semi-planar UV: `width` u16 elements total (`width / 2` pairs).
+        if row.uv_half().len() != w {
+            return Err(MixedSinkerError::RowShapeMismatch {
+                which: RowSlice::UvHalf10,
+                row: idx,
+                expected: w,
+                actual: row.uv_half().len(),
+            });
+        }
+        if idx >= self.height {
+            return Err(MixedSinkerError::RowIndexOutOfRange {
+                row: idx,
+                configured_height: self.height,
+            });
+        }
+
+        let Self {
+            rgb,
+            rgb_u16,
+            luma,
+            hsv,
+            rgb_scratch,
+            ..
+ } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + // Luma: P010 samples are high-bit-packed (`value << 6`). Taking + // the high byte via `>> 8` gives the top 8 bits of the 10-bit + // value — functionally equivalent to + // `(value >> 2)` for the yuv420p10 path. + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> 8) as u8; + } + } + + // `u16` RGB output — low-bit-packed 10-bit values (yuv420p10le + // convention), not P010's high-bit packing. + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + p010_to_rgb_u16_row( + row.y(), + row.uv_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + p010_to_rgb_row( + row.y(), + row.uv_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut 
hsv.v[one_plane_start..one_plane_end],
+                w,
+                use_simd,
+            );
+        }
+        Ok(())
+    }
+}
+
 /// Returns `Ok(())` iff the walker's frame dimensions exactly match
 /// the sinker's configured dimensions. Called from
 /// [`PixelSink::begin_frame`] on both `MixedSinker` and
@@ -1145,8 +1334,8 @@ mod tests {
     use super::*;
     use crate::{
         ColorMatrix,
-        frame::{Nv12Frame, Nv21Frame, Yuv420p10Frame, Yuv420pFrame},
-        yuv::{nv12_to, nv21_to, yuv420p_to, yuv420p10_to},
+        frame::{Nv12Frame, Nv21Frame, P010Frame, Yuv420p10Frame, Yuv420pFrame},
+        yuv::{nv12_to, nv21_to, p010_to, yuv420p_to, yuv420p10_to},
     };
 
     fn solid_yuv420p_frame(
@@ -2170,4 +2359,176 @@ mod tests {
         assert_eq!(rgb_scalar, rgb_simd);
         assert_eq!(rgb_u16_scalar, rgb_u16_simd);
     }
+
+    // ---- P010 --------------------------------------------------------------
+    //
+    // Semi-planar 10-bit, high-bit-packed (samples in high 10 of each
+    // u16). Mirrors the Yuv420p10 test shape but with UV interleaved.
+
+    fn solid_p010_frame(
+        width: u32,
+        height: u32,
+        y_10bit: u16,
+        u_10bit: u16,
+        v_10bit: u16,
+    ) -> (Vec<u16>, Vec<u16>) {
+        let w = width as usize;
+        let h = height as usize;
+        let cw = w / 2;
+        let ch = h / 2;
+        // Shift into the high 10 bits (P010 packing).
+        let y = std::vec![y_10bit << 6; w * h];
+        let uv: Vec<u16> = (0..cw * ch)
+            .flat_map(|_| [u_10bit << 6, v_10bit << 6])
+            .collect();
+        (y, uv)
+    }
+
+    #[test]
+    fn p010_rgb_u8_only_gray_is_gray() {
+        // 10-bit mid-gray Y=512, UV=512 → ~128 u8 RGB across the frame.
+        let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512);
+        let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P010>::new(16, 8).with_rgb(&mut rgb).unwrap();
+        p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+
+    #[test]
+    fn p010_rgb_u16_only_native_depth_gray() {
+        // Output u16 is yuv420p10le-packed (10-bit in low 10) even though
+        // the input is P010-packed.
+        let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512);
+        let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P010>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .unwrap();
+        p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+            assert!(
+                px[0] <= 1023,
+                "output must stay within 10-bit low-packed range"
+            );
+        }
+    }
+
+    #[test]
+    fn p010_rgb_u8_and_u16_both_populated() {
+        // 10-bit full-range white: Y=1023, UV=512. Both buffers fill in
+        // one call.
+        let (yp, uvp) = solid_p010_frame(16, 8, 1023, 512, 512);
+        let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
+        let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P010>::new(16, 8)
+            .with_rgb(&mut rgb_u8)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16)
+            .unwrap();
+        p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(rgb_u8.iter().all(|&c| c == 255));
+        assert!(rgb_u16.iter().all(|&c| c == 1023));
+    }
+
+    #[test]
+    fn p010_luma_downshifts_to_8bit() {
+        // Y=512 at 10 bits, P010-packed (0x8000). After >> 8, the 8-bit
+        // luma is 0x80 = 128.
+        let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512);
+        let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut luma = std::vec![0u8; 16 * 8];
+        let mut sink = MixedSinker::<P010>::new(16, 8)
+            .with_luma(&mut luma)
+            .unwrap();
+        p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(luma.iter().all(|&l| l == 128));
+    }
+
+    #[test]
+    fn p010_matches_yuv420p10_mixed_sinker_with_shifted_samples() {
+        // Logical equivalence: same samples fed through the two formats
+        // (low-packed as yuv420p10, high-packed as P010) must produce
+        // byte-identical u8 RGB.
+        let w = 16u32;
+        let h = 8u32;
+        let y = 600u16;
+        let u = 400u16;
+        let v = 700u16;
+
+        let (yp_p10, up_p10, vp_p10) = solid_yuv420p10_frame(w, h, y, u, v);
+        let src_p10 = Yuv420p10Frame::new(&yp_p10, &up_p10, &vp_p10, w, h, w, w / 2, w / 2);
+
+        let (yp_p010, uvp_p010) = solid_p010_frame(w, h, y, u, v);
+        let src_p010 = P010Frame::new(&yp_p010, &uvp_p010, w, h, w, w);
+
+        let mut rgb_yuv = std::vec![0u8; (w * h * 3) as usize];
+        let mut rgb_p010 = std::vec![0u8; (w * h * 3) as usize];
+        let mut s_yuv = MixedSinker::<Yuv420p10>::new(w as usize, h as usize)
+            .with_rgb(&mut rgb_yuv)
+            .unwrap();
+        let mut s_p010 = MixedSinker::<P010>::new(w as usize, h as usize)
+            .with_rgb(&mut rgb_p010)
+            .unwrap();
+        yuv420p10_to(&src_p10, true, ColorMatrix::Bt709, &mut s_yuv).unwrap();
+        p010_to(&src_p010, true, ColorMatrix::Bt709, &mut s_p010).unwrap();
+        assert_eq!(rgb_yuv, rgb_p010);
+    }
+
+    #[test]
+    fn p010_rgb_u16_too_short_returns_err() {
+        let mut rgb = std::vec![0u16; 10];
+        let err = MixedSinker::<P010>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .err()
+            .unwrap();
+        assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { .. }));
+    }
+
+    #[test]
+    fn p010_with_simd_false_matches_with_simd_true() {
+        // Stubs delegate to scalar so simd=true and simd=false produce
+        // byte-identical output for now. Real SIMD backends will replace
+        // the stubs — equivalence is preserved by design.
+        let (yp, uvp) = solid_p010_frame(64, 16, 600, 400, 700);
+        let src = P010Frame::new(&yp, &uvp, 64, 16, 64, 64);
+
+        let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3];
+        let mut s_scalar = MixedSinker::<P010>::new(64, 16)
+            .with_simd(false)
+            .with_rgb(&mut rgb_scalar)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_scalar)
+            .unwrap();
+        p010_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap();
+
+        let mut rgb_simd = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3];
+        let mut s_simd = MixedSinker::<P010>::new(64, 16)
+            .with_rgb(&mut rgb_simd)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_simd)
+            .unwrap();
+        p010_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap();
+
+        assert_eq!(rgb_scalar, rgb_simd);
+        assert_eq!(rgb_u16_scalar, rgb_u16_simd);
+    }
+}
diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs
index 655b706..eedc2ab 100644
--- a/src/yuv/mod.rs
+++ b/src/yuv/mod.rs
@@ -10,15 +10,20 @@
 //! chroma (Android MediaCodec default).
 //! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits
 //! per sample (HDR10 / 10‑bit SDR software decode).
+//! - [`P010`](crate::yuv::P010) — 4:2:0 semi‑planar at 10 bits per
+//! sample, high‑bit‑packed (HDR hardware decode: VideoToolbox,
+//! VA‑API, NVDEC, D3D11VA, Intel QSV).
 //!
 //! Other families land in follow-up commits.
 
 mod nv12;
 mod nv21;
+mod p010;
 mod yuv420p;
 mod yuv420p10;
 
 pub use nv12::{Nv12, Nv12Row, Nv12Sink, nv12_to};
 pub use nv21::{Nv21, Nv21Row, Nv21Sink, nv21_to};
+pub use p010::{P010, P010Row, P010Sink, p010_to};
 pub use yuv420p::{Yuv420p, Yuv420pRow, Yuv420pSink, yuv420p_to};
 pub use yuv420p10::{Yuv420p10, Yuv420p10Row, Yuv420p10Sink, yuv420p10_to};
diff --git a/src/yuv/p010.rs b/src/yuv/p010.rs
new file mode 100644
index 0000000..744c6ce
--- /dev/null
+++ b/src/yuv/p010.rs
@@ -0,0 +1,150 @@
+//! P010 — semi‑planar 4:2:0, 10‑bit, high‑bit‑packed
+//! (`AV_PIX_FMT_P010LE`).
+//!
+//! Storage is a 2‑plane layout: one full‑size Y plane plus one
+//! interleaved UV plane at half width and half height. Sample width
+//! is `u16` with the 10 active bits in the **high** 10 positions of
+//! each element (`sample = value << 6`), low 6 bits zero. This is
+//! Microsoft's P010 convention and what every HDR hardware decoder
+//! emits — Apple VideoToolbox, VA‑API, NVDEC, D3D11VA, Intel QSV.
+//!
+//! Conversion semantics mirror [`super::Nv12`] on the layout side and
+//! [`super::Yuv420p10`] on the Q‑math side: two consecutive Y rows
+//! share one UV row (4:2:0), chroma is nearest‑neighbor upsampled in
+//! registers inside the row primitive, and every SIMD backend shifts
+//! each `u16` load right by 6 to extract the 10‑bit value before
+//! running the same Q15 pipeline used by [`super::Yuv420p10`].
+
+use crate::{ColorMatrix, PixelSink, SourceFormat, frame::P010Frame, sealed::Sealed};
+
+/// Zero‑sized marker for the P010 source format. Used as the `F` type
+/// parameter on [`crate::sinker::MixedSinker`].
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
+pub struct P010;
+
+impl Sealed for P010 {}
+impl SourceFormat for P010 {}
+
+/// One output row of a P010 source handed to a [`P010Sink`].
+///
+/// Accessors:
+/// - [`y`](Self::y) — full‑width Y row (`width` `u16` samples, high‑
+/// bit‑packed).
+/// - [`uv_half`](Self::uv_half) — **interleaved, half‑width** UV row
+/// (`width` `u16` elements = `width / 2` U/V pairs, U first). The
+/// row primitive deinterleaves and upsamples in‑register.
+/// - [`row`](Self::row) — output row index (`0 ..= frame.height() - 1`).
+/// - [`matrix`](Self::matrix), [`full_range`](Self::full_range) —
+/// carried through from the kernel call.
+#[derive(Debug, Clone, Copy)]
+pub struct P010Row<'a> {
+    y: &'a [u16],
+    uv_half: &'a [u16],
+    row: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+}
+
+impl<'a> P010Row<'a> {
+    /// Bundles one row of a P010 source for a [`P010Sink`].
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub(crate) fn new(
+        y: &'a [u16],
+        uv_half: &'a [u16],
+        row: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) -> Self {
+        Self {
+            y,
+            uv_half,
+            row,
+            matrix,
+            full_range,
+        }
+    }
+
+    /// Full‑width Y (luma) row — `width` `u16` samples, high‑bit‑packed
+    /// (10 active bits in the high 10 of each element).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn y(&self) -> &'a [u16] {
+        self.y
+    }
+
+    /// Interleaved UV row — `width` `u16` elements laid out as
+    /// `U0, V0, U1, V1, …, U_{w/2-1}, V_{w/2-1}`. Each element is
+    /// high‑bit‑packed.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn uv_half(&self) -> &'a [u16] {
+        self.uv_half
+    }
+
+    /// Output row index within the frame.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn row(&self) -> usize {
+        self.row
+    }
+
+    /// YUV → RGB matrix carried through from the kernel call.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn matrix(&self) -> ColorMatrix {
+        self.matrix
+    }
+
+    /// `true` iff Y uses the full sample range (`[0, 1023]` for 10‑bit,
+    /// scaled into the high 10 bits of each `u16`); `false` for limited
+    /// range.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn full_range(&self) -> bool {
+        self.full_range
+    }
+}
+
+/// Sinks that consume P010 rows.
+///
+/// A subtrait of [`PixelSink`] that pins the row shape to
+/// [`P010Row`]. Implementors get
+/// `process(&mut self, row: P010Row<'_>) -> Result<(), Self::Error>`
+/// via the supertrait.
+pub trait P010Sink: for<'a> PixelSink<Input<'a> = P010Row<'a>> {}
+
+/// Converts a P010 frame by walking its rows and feeding each one to
+/// the [`P010Sink`].
+///
+/// The kernel is a pure row walker — no color arithmetic happens
+/// here. Slice math picks the Y row and the correct UV row for each
+/// output row (`chroma_row = row / 2` for 4:2:0) and hands borrows to
+/// the Sink. The Sink decides what to derive and where to write.
+pub fn p010_to<S: P010Sink>(
+    src: &P010Frame<'_>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    sink.begin_frame(src.width(), src.height())?;
+
+    let w = src.width() as usize;
+    let h = src.height() as usize;
+    let y_stride = src.y_stride() as usize;
+    let uv_stride = src.uv_stride() as usize;
+    // UV row payload is `width` `u16` elements — `width / 2` interleaved
+    // U/V pairs.
+    let uv_row_elems = w;
+
+    let y_plane = src.y();
+    let uv_plane = src.uv();
+
+    for row in 0..h {
+        let y_start = row * y_stride;
+        let y = &y_plane[y_start..y_start + w];
+
+        // 4:2:0 chroma subsampling: two consecutive Y rows share one UV
+        // row.
+        let chroma_row = row / 2;
+        let uv_start = chroma_row * uv_stride;
+        let uv_half = &uv_plane[uv_start..uv_start + uv_row_elems];
+
+        sink.process(P010Row::new(y, uv_half, row, matrix, full_range))?;
+    }
+    Ok(())
+}