diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0120375..d0fc00b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -200,6 +200,52 @@ jobs: export CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="$SDE_PATH/sde64 -icx --" cargo test --all-features + # Run the wasm32 simd128 lib tests under wasmtime. The `cross` job + # above only builds for wasm targets; without this job the + # `wasm_simd128` backend's handcrafted swizzles / clamps / u16 + # stores were dispatchable in production (under + # `-C target-feature=+simd128`) but never runtime‑verified. This job + # runs every scalar‑equivalence test — including the new yuv420p10 + # u8 / u16 output paths and the adversarial out‑of‑range regressions + # — against an actual wasm runtime. + # + # `wasm32-wasip1` is the wasi preview‑1 target (libstd + file/env + # APIs that the test harness needs). Criterion is gated out of the + # wasm dev‑deps in Cargo.toml because rayon doesn't build for wasi. + test-wasm-simd128: + name: test-wasm-simd128 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Cache cargo build and registry + uses: actions/cache@v5 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-test-wasm-simd128-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-test-wasm-simd128- + - name: Install Rust + wasm32-wasip1 + run: | + rustup update stable --no-self-update + rustup default stable + rustup target add wasm32-wasip1 + - name: Install wasmtime + run: | + curl https://wasmtime.dev/install.sh -sSf | bash + echo "$HOME/.wasmtime/bin" >> "$GITHUB_PATH" + - name: Run lib tests under wasmtime (simd128) + env: + # `cargo test` hands the compiled `.wasm` test binary as the + # first positional arg after `--`; wasmtime's `run --` + # interprets that as the module path. We don't need filesystem + # or env access — the tests are pure compute. 
+ CARGO_TARGET_WASM32_WASIP1_RUNNER: wasmtime run -- + RUSTFLAGS: -C target-feature=+simd128 + run: cargo test --lib --target wasm32-wasip1 + sanitizer: name: sanitizer runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index b13c252..2f34a10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,10 @@ harness = false name = "nv21_to_rgb" harness = false +[[bench]] +name = "yuv_420p10_to_rgb" +harness = false + [[bench]] name = "rgb_to_hsv" harness = false @@ -43,9 +47,15 @@ thiserror = { version = "2", default-features = false } libm = { version = "0.2", optional = true } [dev-dependencies] -criterion = "0.8" tempfile = "3" +# Criterion pulls in rayon, which doesn't build for the wasm32‑wasi* +# targets we use to run the simd128 backends under wasmtime. Gate it +# to non‑wasm hosts — benches never run on wasm anyway (they need +# system threading / timing that the wasi runner doesn't expose). +[target.'cfg(not(target_family = "wasm"))'.dev-dependencies] +criterion = "0.8" + [profile.bench] opt-level = 3 debug = false diff --git a/benches/yuv_420p10_to_rgb.rs b/benches/yuv_420p10_to_rgb.rs new file mode 100644 index 0000000..3659f65 --- /dev/null +++ b/benches/yuv_420p10_to_rgb.rs @@ -0,0 +1,108 @@ +//! Per‑row YUV 4:2:0 10‑bit → packed RGB throughput baseline. +//! +//! Three variants per width: +//! - `u8_simd` / `u8_scalar` — native‑SIMD vs scalar on the u8 output +//! path (analogous to the 8‑bit bench). +//! - `u16_simd` / `u16_scalar` — same pair for the native‑depth u16 +//! output path. The u16 path writes 2× the bytes so the MB/s +//! figure is comparable only within the u16 column. 
+ +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic 10‑bit pseudo‑random +/// sequence — values occupy the low 10 bits of each `u16`, matching +/// the storage layout of `yuv420p10le`. +fn fill_pseudo_random_u16(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = ((state >> 8) & 0x3FF) as u16; + } +} + +fn bench(c: &mut Criterion) { + // 720p / 1080p / 4K — multiples of 64 so the widest backend + // (AVX‑512, 64 pixels per iteration) covers each fully without tail + // work. Avoids skewing comparisons across targets. + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + // ---- u8 output ------------------------------------------------------ + let mut group_u8 = c.benchmark_group("yuv420p10_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p10_to_rgb_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + // ---- u16 native-depth output ---------------------------------------- + let mut group_u16 = c.benchmark_group("yuv420p10_to_rgb_u16_row"); + + for &w in WIDTHS { + 
let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u16; w * 3]; + + // u16 output writes 2× the bytes of u8. + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p10_to_rgb_u16_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/src/frame.rs b/src/frame.rs index b5e96f6..585d59f 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -5,7 +5,7 @@ //! validates strides vs. widths and that each plane covers its //! declared area. -use derive_more::IsVariant; +use derive_more::{Display, IsVariant}; use thiserror::Error; /// A validated YUV 4:2:0 planar frame. @@ -686,6 +686,490 @@ pub enum Nv21FrameError { }, } +/// A validated YUV 4:2:0 planar frame at bit depths > 8 (10/12/14). +/// +/// Structurally identical to [`Yuv420pFrame`] — three planes, half‑ +/// size chroma — but sample storage is **`u16`** so every pixel +/// carries up to 16 bits of payload. `BITS` is the active bit depth +/// (10, 12, or 14). Callers are **expected** to store each sample in +/// the **low** `BITS` bits of its `u16` (upper `16 - BITS` bits zero), +/// matching FFmpeg's little‑endian `yuv420p10le` / `yuv420p12le` / +/// `yuv420p14le` convention, where each plane is a byte buffer +/// reinterpretable as `u16` little‑endian. `try_new` validates plane +/// geometry / strides / lengths but does **not** inspect sample +/// values to verify this packing. 
+/// +/// This is **not** the FFmpeg `p010` layout — `p010` stores samples +/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers +/// holding a p010 buffer must shift right by `16 - BITS` before +/// construction. +/// +/// # Input sample range +/// +/// The kernels assume every input sample is in `[0, (1 << BITS) - 1]` +/// — i.e., upper `16 - BITS` bits zero. Validating this at +/// construction would require scanning every sample of every plane +/// (megabytes per frame at video rates); instead the constructor +/// validates geometry only and the contract falls on the caller. +/// Decoders and FFmpeg output satisfy this by construction. +/// +/// **Output for out‑of‑range samples is equivalent to pre‑masking +/// every sample to the low `BITS` bits.** Every kernel (scalar + all +/// 5 SIMD tiers) AND‑masks each `u16` load to `(1 << BITS) - 1` +/// before the Q15 path, so a sample like `0xFFC0` (p010 white = +/// `1023 << 6`) is treated identically to `0x03C0` on every backend +/// when `BITS == 10`. This gives deterministic, backend‑independent +/// output for mispacked input — feeding `p010` data into a +/// `yuv420p10le`‑shaped frame produces severely distorted, but stable, +/// pixel values across scalar / NEON / SSE4.1 / AVX2 / AVX‑512 / +/// wasm simd128, which is an obvious signal for downstream diffing. +/// The mask is a single AND per load and a no‑op on valid input +/// (upper bits already zero). +/// +/// Callers who want the mispacking to surface as a loud error +/// instead of silent color corruption should use +/// [`Self::try_new_checked`] — it scans every sample and returns +/// [`Yuv420pFrame16Error::SampleOutOfRange`] on the first violation. +/// +/// colconv v0.2 ships `BITS == 10` only (the use‑case keystone for +/// HDR and 10‑bit SDR). 
12 and 14 are mechanical follow‑ups that +/// just relax the constructor's `BITS` check and add tiered aliases +/// — the kernel math (Q15 coefficients + i32 intermediates) works +/// unchanged across all three, derived at compile time from `BITS`. +/// +/// 16‑bit input (which would overflow the i32 chroma sum in the +/// Q15 path) is **not** represented by this type — it needs a +/// separate kernel family with i64 intermediates or a lower Q +/// coefficient format. That lands in a later ship. +/// +/// Stride is in **samples** (`u16` elements), not bytes. Users +/// holding a byte buffer from FFmpeg should cast via +/// [`bytemuck::cast_slice`] and divide `linesize[i]` by 2 before +/// constructing. +/// +/// `width` must be even (same 4:2:0 rationale as [`Yuv420pFrame`]); +/// `height` may be odd and is handled via `height.div_ceil(2)` in +/// chroma‑row sizing. +#[derive(Debug, Clone, Copy)] +pub struct Yuv420pFrame16<'a, const BITS: u32> { + y: &'a [u16], + u: &'a [u16], + v: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + u_stride: u32, + v_stride: u32, +} + +impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { + /// Constructs a new [`Yuv420pFrame16`], validating dimensions, plane + /// lengths, and the `BITS` parameter. + /// + /// Returns [`Yuv420pFrame16Error`] if any of: + /// - `BITS` is not 10, 12, or 14 (colconv v0.2 additionally rejects + /// 12/14 at the type alias layer — see [`Yuv420p10Frame`]), + /// - `width` or `height` is zero, + /// - `width` is odd, + /// - any stride is smaller than the plane's declared pixel width, + /// - any plane is too short to cover its declared rows, or + /// - `stride * rows` overflows `usize` (32‑bit targets only). + /// + /// All strides are in **samples** (`u16` elements). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub const fn try_new( + y: &'a [u16], + u: &'a [u16], + v: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + u_stride: u32, + v_stride: u32, + ) -> Result<Self, Yuv420pFrame16Error> { + // Guard the `BITS` parameter at the top so users who accidentally + // monomorphize on e.g. `BITS == 8` (which would work numerically + // but should go through [`Yuv420pFrame`] instead) or `BITS == 16` + // (which would overflow the i32 chroma sum in the Q15 kernel) + // get a clear error rather than silently wrong output. + if BITS != 10 && BITS != 12 && BITS != 14 { + return Err(Yuv420pFrame16Error::UnsupportedBits { bits: BITS }); + } + if width == 0 || height == 0 { + return Err(Yuv420pFrame16Error::ZeroDimension { width, height }); + } + if width & 1 != 0 { + return Err(Yuv420pFrame16Error::OddWidth { width }); + } + if y_stride < width { + return Err(Yuv420pFrame16Error::YStrideTooSmall { width, y_stride }); + } + let chroma_width = width.div_ceil(2); + if u_stride < chroma_width { + return Err(Yuv420pFrame16Error::UStrideTooSmall { + chroma_width, + u_stride, + }); + } + if v_stride < chroma_width { + return Err(Yuv420pFrame16Error::VStrideTooSmall { + chroma_width, + v_stride, + }); + } + + // Plane sizes are in `u16` elements, so the overflow guard runs + // against the sample count — callers converting from byte strides + // should have already divided by 2. 
+ let y_min = match (y_stride as usize).checked_mul(height as usize) { + Some(v) => v, + None => { + return Err(Yuv420pFrame16Error::GeometryOverflow { + stride: y_stride, + rows: height, + }); + } + }; + if y.len() < y_min { + return Err(Yuv420pFrame16Error::YPlaneTooShort { + expected: y_min, + actual: y.len(), + }); + } + let chroma_height = height.div_ceil(2); + let u_min = match (u_stride as usize).checked_mul(chroma_height as usize) { + Some(v) => v, + None => { + return Err(Yuv420pFrame16Error::GeometryOverflow { + stride: u_stride, + rows: chroma_height, + }); + } + }; + if u.len() < u_min { + return Err(Yuv420pFrame16Error::UPlaneTooShort { + expected: u_min, + actual: u.len(), + }); + } + let v_min = match (v_stride as usize).checked_mul(chroma_height as usize) { + Some(v) => v, + None => { + return Err(Yuv420pFrame16Error::GeometryOverflow { + stride: v_stride, + rows: chroma_height, + }); + } + }; + if v.len() < v_min { + return Err(Yuv420pFrame16Error::VPlaneTooShort { + expected: v_min, + actual: v.len(), + }); + } + + Ok(Self { + y, + u, + v, + width, + height, + y_stride, + u_stride, + v_stride, + }) + } + + /// Constructs a new [`Yuv420pFrame16`], panicking on invalid inputs. + /// Prefer [`Self::try_new`] when inputs may be invalid at runtime. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub const fn new( + y: &'a [u16], + u: &'a [u16], + v: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + u_stride: u32, + v_stride: u32, + ) -> Self { + match Self::try_new(y, u, v, width, height, y_stride, u_stride, v_stride) { + Ok(frame) => frame, + Err(_) => panic!("invalid Yuv420pFrame16 dimensions or plane lengths"), + } + } + + /// Like [`Self::try_new`] but additionally scans every sample of + /// every plane and rejects values above `(1 << BITS) - 1`. 
Use this + /// on untrusted input (e.g., a `u16` buffer of unknown provenance + /// that might be `p010`‑packed or otherwise dirty) where accepting + /// out-of-range samples would be unacceptable because they violate + /// the expected bit-depth contract and can produce invalid results. + /// + /// Cost: one O(plane_size) linear scan per plane — a few megabytes + /// per 1080p frame at 10 bits. The default [`Self::try_new`] skips + /// this so the hot path (decoder output, already-conforming + /// buffers) stays O(1). + /// + /// Returns [`Yuv420pFrame16Error::SampleOutOfRange`] on the first + /// offending sample — the error carries the plane, element index + /// within that plane's slice, offending value, and the valid + /// maximum so the caller can pinpoint the bad sample. All of + /// [`Self::try_new`]'s geometry errors are still possible. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub fn try_new_checked( + y: &'a [u16], + u: &'a [u16], + v: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + u_stride: u32, + v_stride: u32, + ) -> Result<Self, Yuv420pFrame16Error> { + let frame = Self::try_new(y, u, v, width, height, y_stride, u_stride, v_stride)?; + let max_valid: u16 = ((1u32 << BITS) - 1) as u16; + // Scan the declared-payload region of each plane. Stride may add + // unused padding past the declared width; we don't inspect that — + // callers often pass buffers whose padding bytes are arbitrary, + // and the kernels never read them. 
+ let w = width as usize; + let h = height as usize; + let chroma_w = w / 2; + let chroma_h = height.div_ceil(2) as usize; + for row in 0..h { + let start = row * y_stride as usize; + for (col, &s) in y[start..start + w].iter().enumerate() { + if s > max_valid { + return Err(Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::Y, + index: start + col, + value: s, + max_valid, + }); + } + } + } + for row in 0..chroma_h { + let start = row * u_stride as usize; + for (col, &s) in u[start..start + chroma_w].iter().enumerate() { + if s > max_valid { + return Err(Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::U, + index: start + col, + value: s, + max_valid, + }); + } + } + } + for row in 0..chroma_h { + let start = row * v_stride as usize; + for (col, &s) in v[start..start + chroma_w].iter().enumerate() { + if s > max_valid { + return Err(Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::V, + index: start + col, + value: s, + max_valid, + }); + } + } + } + Ok(frame) + } + + /// Y (luma) plane samples. Row `r` starts at sample offset + /// `r * y_stride()`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn y(&self) -> &'a [u16] { + self.y + } + + /// U (Cb) plane samples. Row `r` starts at sample offset + /// `r * u_stride()`. U has half the width and half the height of the + /// frame (chroma row index for output row `r` is `r / 2`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn u(&self) -> &'a [u16] { + self.u + } + + /// V (Cr) plane samples. Row `r` starts at sample offset + /// `r * v_stride()`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn v(&self) -> &'a [u16] { + self.v + } + + /// Frame width in pixels. Always even. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn width(&self) -> u32 { + self.width + } + + /// Frame height in pixels. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn height(&self) -> u32 { + self.height + } + + /// Sample stride of the Y plane (`>= width`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn y_stride(&self) -> u32 { + self.y_stride + } + + /// Sample stride of the U plane (`>= width / 2`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn u_stride(&self) -> u32 { + self.u_stride + } + + /// Sample stride of the V plane (`>= width / 2`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn v_stride(&self) -> u32 { + self.v_stride + } + + /// Active bit depth — 10, 12, or 14. Mirrors the `BITS` const + /// parameter so generic code can read it without naming the type. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn bits(&self) -> u32 { + BITS + } +} + +/// Type alias for a validated YUV 4:2:0 planar frame at 10 bits per +/// sample (`AV_PIX_FMT_YUV420P10LE`). Tight wrapper over +/// [`Yuv420pFrame16`] with `BITS == 10` — use this name at call sites +/// for readability. +pub type Yuv420p10Frame<'a> = Yuv420pFrame16<'a, 10>; + +/// Errors returned by [`Yuv420pFrame16::try_new`]. Variant shape +/// mirrors [`Yuv420pFrameError`], with `UnsupportedBits` added for +/// the new `BITS` parameter and all sizes expressed in **samples** +/// (`u16` elements) instead of bytes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] +#[non_exhaustive] +pub enum Yuv420pFrame16Error { + /// `BITS` was not one of the supported depths (10, 12, 14). 8‑bit + /// frames should use [`Yuv420pFrame`]; 16‑bit needs a separate + /// kernel family (see [`Yuv420pFrame16`] docs). + #[error("unsupported BITS ({bits}) for Yuv420pFrame16; must be 10, 12, or 14")] + UnsupportedBits { + /// The unsupported value of the `BITS` const parameter. + bits: u32, + }, + /// `width` or `height` was zero. + #[error("width ({width}) or height ({height}) is zero")] + ZeroDimension { + /// The supplied width. 
+ width: u32, + /// The supplied height. + height: u32, + }, + /// `width` was odd. Same 4:2:0 rationale as + /// [`Yuv420pFrameError::OddWidth`]. + #[error("width ({width}) is odd; YUV420p / 4:2:0 requires even width")] + OddWidth { + /// The supplied width. + width: u32, + }, + /// `y_stride < width` (in samples). + #[error("y_stride ({y_stride}) is smaller than width ({width})")] + YStrideTooSmall { + /// Declared frame width in pixels. + width: u32, + /// The supplied Y‑plane stride (samples). + y_stride: u32, + }, + /// `u_stride < ceil(width / 2)` (in samples). + #[error("u_stride ({u_stride}) is smaller than chroma width ({chroma_width})")] + UStrideTooSmall { + /// Required minimum chroma‑plane stride. + chroma_width: u32, + /// The supplied U‑plane stride (samples). + u_stride: u32, + }, + /// `v_stride < ceil(width / 2)` (in samples). + #[error("v_stride ({v_stride}) is smaller than chroma width ({chroma_width})")] + VStrideTooSmall { + /// Required minimum chroma‑plane stride. + chroma_width: u32, + /// The supplied V‑plane stride (samples). + v_stride: u32, + }, + /// Y plane is shorter than `y_stride * height` samples. + #[error("Y plane has {actual} samples but at least {expected} are required")] + YPlaneTooShort { + /// Minimum samples required. + expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// U plane is shorter than `u_stride * ceil(height / 2)` samples. + #[error("U plane has {actual} samples but at least {expected} are required")] + UPlaneTooShort { + /// Minimum samples required. + expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// V plane is shorter than `v_stride * ceil(height / 2)` samples. + #[error("V plane has {actual} samples but at least {expected} are required")] + VPlaneTooShort { + /// Minimum samples required. + expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// `stride * rows` overflows `usize` (32‑bit targets only). 
+ #[error("declared geometry overflows usize: stride={stride} * rows={rows}")] + GeometryOverflow { + /// Stride of the plane whose size overflowed. + stride: u32, + /// Row count that overflowed against the stride. + rows: u32, + }, + /// A plane sample exceeds `(1 << BITS) - 1` — i.e., a bit above the + /// declared active depth is set. Only [`Yuv420pFrame16::try_new_checked`] + /// can produce this error; [`Yuv420pFrame16::try_new`] validates + /// geometry only and treats the low‑bit‑packing contract as an + /// expectation. Use the checked constructor for untrusted input + /// (e.g., a buffer that might be `p010`‑packed instead of + /// `yuv420p10le`‑packed). + #[error( + "sample {value} on plane {plane} at element {index} exceeds {max_valid} ((1 << BITS) - 1)" + )] + SampleOutOfRange { + /// Which plane the offending sample lives on. + plane: Yuv420pFrame16Plane, + /// Element index within that plane's slice. This is the raw + /// `&[u16]` index — it accounts for stride padding rows, so + /// `index / stride` is the row, `index % stride` is the + /// in‑row position. + index: usize, + /// The offending sample value. + value: u16, + /// The maximum allowed value for this `BITS` (`(1 << BITS) - 1`). + max_valid: u16, + }, +} + +/// Identifies which plane of a [`Yuv420pFrame16`] an error refers to. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)] +pub enum Yuv420pFrame16Plane { + /// Luma plane. + Y, + /// U (Cb) chroma plane. + U, + /// V (Cr) chroma plane. + V, +} + /// Errors returned by [`Yuv420pFrame::try_new`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] #[non_exhaustive] @@ -1057,4 +1541,227 @@ mod tests { let e = Nv21Frame::try_new(&y, &vu, big, big, big, big).unwrap_err(); assert!(matches!(e, Nv21FrameError::GeometryOverflow { .. })); } + + // ---- Yuv420pFrame16 / Yuv420p10Frame ---------------------------------- + // + // Storage is `&[u16]` with sample-indexed strides. 
Validation mirrors + // the 8-bit [`Yuv420pFrame`] with the addition of the `BITS` guard. + + fn p10_planes() -> (std::vec::Vec<u16>, std::vec::Vec<u16>, std::vec::Vec<u16>) { + // 16×8 frame, chroma 8×4. Y plane solid black (Y=0); UV planes + // neutral (UV=512 = 10‑bit chroma center). Exact sample values + // don't matter for the constructor tests that use this helper — + // they only look at shape, geometry errors, and the reported + // bits. + ( + std::vec![0u16; 16 * 8], + std::vec![512u16; 8 * 4], + std::vec![512u16; 8 * 4], + ) + } + + #[test] + fn yuv420p10_try_new_accepts_valid_tight() { + let (y, u, v) = p10_planes(); + let f = Yuv420p10Frame::try_new(&y, &u, &v, 16, 8, 16, 8, 8).expect("valid"); + assert_eq!(f.width(), 16); + assert_eq!(f.height(), 8); + assert_eq!(f.bits(), 10); + } + + #[test] + fn yuv420p10_try_new_accepts_odd_height() { + // 16x9 → chroma_height = 5. Y plane 16*9 = 144 samples, U/V 8*5 = 40. + let y = std::vec![0u16; 16 * 9]; + let u = std::vec![512u16; 8 * 5]; + let v = std::vec![512u16; 8 * 5]; + let f = Yuv420p10Frame::try_new(&y, &u, &v, 16, 9, 16, 8, 8).expect("odd height valid"); + assert_eq!(f.height(), 9); + } + + #[test] + fn yuv420p10_try_new_rejects_odd_width() { + let (y, u, v) = p10_planes(); + let e = Yuv420p10Frame::try_new(&y, &u, &v, 15, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::OddWidth { width: 15 })); + } + + #[test] + fn yuv420p10_try_new_rejects_zero_dim() { + let (y, u, v) = p10_planes(); + let e = Yuv420p10Frame::try_new(&y, &u, &v, 0, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::ZeroDimension { .. })); + } + + #[test] + fn yuv420p10_try_new_rejects_short_y_plane() { + let y = std::vec![0u16; 10]; + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::YPlaneTooShort { ..
})); + } + + #[test] + fn yuv420p10_try_new_rejects_short_u_plane() { + let y = std::vec![0u16; 16 * 8]; + let u = std::vec![512u16; 4]; + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::UPlaneTooShort { .. })); + } + + #[test] + fn yuv420p16_try_new_rejects_unsupported_bits() { + // BITS == 9 is not in {10, 12, 14}; the constructor must reject it + // before any plane math runs. This also exercises the 12/14 path + // at the validation layer even though Ship 2 only ships a 10-bit + // alias. + let y = std::vec![0u16; 16 * 8]; + let u = std::vec![128u16; 8 * 4]; + let v = std::vec![128u16; 8 * 4]; + let e = Yuv420pFrame16::<9>::try_new(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::UnsupportedBits { bits: 9 } + )); + + let e16 = Yuv420pFrame16::<16>::try_new(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e16, + Yuv420pFrame16Error::UnsupportedBits { bits: 16 } + )); + } + + #[test] + fn yuv420p16_try_new_accepts_12_and_14() { + // The constructor admits 12 and 14 — Ship 2 doesn't ship kernels + // for them but the geometry validator shouldn't block the types. 
+ let y = std::vec![0u16; 16 * 8]; + let u = std::vec![2048u16; 8 * 4]; + let v = std::vec![2048u16; 8 * 4]; + let f12 = Yuv420pFrame16::<12>::try_new(&y, &u, &v, 16, 8, 16, 8, 8).expect("12-bit valid"); + assert_eq!(f12.bits(), 12); + let f14 = Yuv420pFrame16::<14>::try_new(&y, &u, &v, 16, 8, 16, 8, 8).expect("14-bit valid"); + assert_eq!(f14.bits(), 14); + } + + #[test] + #[should_panic(expected = "invalid Yuv420pFrame16")] + fn yuv420p10_new_panics_on_invalid() { + let y = std::vec![0u16; 10]; + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + let _ = Yuv420p10Frame::new(&y, &u, &v, 16, 8, 16, 8, 8); + } + + #[cfg(target_pointer_width = "32")] + #[test] + fn yuv420p10_try_new_rejects_geometry_overflow() { + // Sample count overflow on 32-bit. Same rationale as the 8-bit + // version — strides are in `u16` elements here, so the same + // `0x1_0000 * 0x1_0000` product overflows `usize`. + let big: u32 = 0x1_0000; + let y: [u16; 0] = []; + let u: [u16; 0] = []; + let v: [u16; 0] = []; + let e = Yuv420p10Frame::try_new(&y, &u, &v, big, big, big, big / 2, big / 2).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::GeometryOverflow { .. })); + } + + #[test] + fn yuv420p10_try_new_checked_accepts_in_range_samples() { + // Same valid frame as `yuv420p10_try_new_accepts_valid_tight`, + // but run through the checked constructor. All samples live in + // the 10‑bit range. + let (y, u, v) = p10_planes(); + let f = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).expect("valid"); + assert_eq!(f.width(), 16); + assert_eq!(f.bits(), 10); + } + + #[test] + fn yuv420p10_try_new_checked_rejects_y_high_bit_set() { + // A Y sample with bit 15 set — typical of `p010` packing where + // the 10 active bits sit in the high bits. `try_new` would + // accept this and let the SIMD kernels produce arch‑dependent + // garbage; `try_new_checked` catches it up front. 
+ let mut y = std::vec![0u16; 16 * 8]; + y[3 * 16 + 5] = 0x8000; // bit 15 set → way above 1023 + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + match e { + Yuv420pFrame16Error::SampleOutOfRange { + plane, + value, + max_valid, + .. + } => { + assert_eq!(plane, Yuv420pFrame16Plane::Y); + assert_eq!(value, 0x8000); + assert_eq!(max_valid, 1023); + } + other => panic!("expected SampleOutOfRange, got {other:?}"), + } + } + + #[test] + fn yuv420p10_try_new_checked_rejects_u_plane_sample() { + // Offending sample in the U plane — error must name U, not Y or V. + let y = std::vec![0u16; 16 * 8]; + let mut u = std::vec![512u16; 8 * 4]; + u[2 * 8 + 3] = 1024; // just above max + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::U, + value: 1024, + max_valid: 1023, + .. + } + )); + } + + #[test] + fn yuv420p10_try_new_checked_rejects_v_plane_sample() { + let y = std::vec![0u16; 16 * 8]; + let u = std::vec![512u16; 8 * 4]; + let mut v = std::vec![512u16; 8 * 4]; + v[1 * 8 + 7] = 0xFFFF; // all bits set + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::V, + max_valid: 1023, + .. + } + )); + } + + #[test] + fn yuv420p10_try_new_checked_accepts_exact_max_sample() { + // Boundary: sample value == (1 << BITS) - 1 is valid. 
+ let mut y = std::vec![0u16; 16 * 8]; + y[0] = 1023; + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).expect("1023 is in range"); + } + + #[test] + fn yuv420p10_try_new_checked_reports_geometry_errors_first() { + // If geometry is invalid, we never get to the sample scan — the + // same errors as `try_new` surface first. Prevents the checked + // path from doing unnecessary O(N) work on inputs that would + // fail for a simpler reason. + let y = std::vec![0u16; 10]; // Too small. + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::YPlaneTooShort { .. })); + } } diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index c8fb3e9..5cefb9e 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -33,13 +33,14 @@ //! 8. Saturate‑narrow to u8x16 and interleave with `vst3q_u8`. 
use core::arch::aarch64::{ - float32x4_t, int16x8_t, int32x4_t, uint8x16_t, uint8x16x3_t, vaddq_f32, vaddq_s32, vbslq_f32, - vceqq_f32, vcltq_f32, vcombine_s16, vcombine_u8, vcombine_u16, vcvtq_f32_u32, vcvtq_u32_f32, - vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vget_high_s16, vget_high_u8, vget_high_u16, - vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, vld1q_u8, vld2_u8, vld3q_u8, vmaxq_f32, - vminq_f32, vmovl_s16, vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, - vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vshrq_n_s32, vst1q_u8, vst3q_u8, - vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, + float32x4_t, int16x8_t, int32x4_t, uint8x16_t, uint8x16x3_t, uint16x8_t, uint16x8x3_t, vaddq_f32, + vaddq_s32, vandq_u16, vbslq_f32, vceqq_f32, vcltq_f32, vcombine_s16, vcombine_u8, vcombine_u16, + vcvtq_f32_u32, vcvtq_u32_f32, vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_u16, + vget_high_s16, vget_high_u8, vget_high_u16, vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, + vld1q_u8, vld1q_u16, vld2_u8, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, vmovl_s16, + vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, vqaddq_s16, + vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, vst1q_u8, + vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, }; use crate::{ColorMatrix, row::scalar}; @@ -189,6 +190,304 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// NEON YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size is 16 Y pixels / 8 chroma pairs per iteration. The +/// pipeline mirrors [`yuv_420_to_rgb_row`] byte‑for‑byte; the only +/// structural differences are: +/// - Loads are `vld1q_u16` (8 lanes of `u16`) instead of `vld1q_u8` +/// (16 lanes of `u8`), so each Y iteration needs two Y loads to +/// cover 16 pixels — there's no widening step because the samples +/// already live in 16‑bit lanes. 
+/// - Chroma bias is **512** (10‑bit center) rather than 128. +/// - Range‑scaling params come from [`scalar::range_params_n`] with +/// `BITS = 10, OUT_BITS = 8`, so `y_scale` / `c_scale` are ~¼ the +/// 8‑bit values (mapping 10‑bit input to 8‑bit output). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`] — every +/// Q15 multiply / shift mirrors the scalar path exactly, with the +/// same `(prod + (1 << 14)) >> 15` rounding. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: NEON availability is the caller's obligation; the + // dispatcher in `crate::row` verifies it. Pointer adds are bounded + // by the `while x + 16 <= width` loop condition and the caller‑ + // promised slice lengths checked above. 
+ unsafe { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let mask_v = vdupq_n_u16(scalar::bits_mask::<10>()); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // Two Y loads cover 16 lanes; one U load + one V load cover 8 + // chroma each. Each load is AND‑masked to the low 10 bits so + // out‑of‑range samples (e.g. `p010`‑style packing with the + // 10 active bits in the high 10 of each u16) can never push + // an intermediate past i16 range. For valid input the AND is + // a no‑op (samples already in [0, 1023]). + let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); + let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); + let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); + let v_vec = vandq_u16(vld1q_u16(v_half.as_ptr().add(x / 2)), mask_v); + + let y_lo = vreinterpretq_s16_u16(y_vec_lo); + let y_hi = vreinterpretq_s16_u16(y_vec_hi); + + // c - 512 for 10‑bit chroma, fits i16 since c ≤ 1023. + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + // Widen to i32x4 halves so the Q15 multiplies don't overflow. 
+ let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Duplicate the 8 chroma lanes into 16‑lane pairs — identical + // nearest‑neighbor upsample strategy as the 8‑bit kernel. + let r_dup_lo = vzip1q_s16(r_chroma, r_chroma); + let r_dup_hi = vzip2q_s16(r_chroma, r_chroma); + let g_dup_lo = vzip1q_s16(g_chroma, g_chroma); + let g_dup_hi = vzip2q_s16(g_chroma, g_chroma); + let b_dup_lo = vzip1q_s16(b_chroma, b_chroma); + let b_dup_hi = vzip2q_s16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_lo, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); + + // u8 output: saturate‑narrow i16 → u8 clamps to [0, 255]. + let b_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, b_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, b_dup_hi)), + ); + let g_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, g_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, g_dup_hi)), + ); + let r_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, r_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), + ); + + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 16; + } + + // Scalar tail — remaining < 16 pixels (always even per 4:2:0). 
+ if x < width { + scalar::yuv_420p_n_to_rgb_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// NEON YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB (native depth). +/// +/// Block size is 16 Y pixels / 8 chroma pairs per iteration. Shares +/// all pre‑write math with [`yuv420p10_to_rgb_row`]; the only +/// difference is the final clamp + write: +/// - Y‑path scale is calibrated for `OUT_BITS = 10` rather than 8, +/// so `y_scaled` lives in `[0, 1023]` before the chroma add. +/// - The `y_scaled + chroma` sum is clamped to `[0, 1023]` with +/// `vmaxq_s16(vminq_s16(_, 1023), 0)` — a simple saturate‑narrow +/// doesn't suffice because the sum can overshoot 1023 (up to ~2046 +/// without saturating at i16 bounds). +/// - Writes use two `vst3q_u16` calls per iteration — each handles 8 +/// pixels × 3 channels = 24 `u16` elements, so two cover 16 pixels. +/// +/// # Numerical contract +/// +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`] — every +/// Q15 multiply / shift / clamp mirrors the scalar reference. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: NEON availability is the caller's obligation; the + // dispatcher in `crate::row` verifies it. Pointer adds are bounded + // by the `while x + 16 <= width` loop condition and the caller‑ + // promised slice lengths. + unsafe { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let mask_v = vdupq_n_u16(scalar::bits_mask::<10>()); + let max_v = vdupq_n_s16(OUT_MAX_10); + let zero_v = vdupq_n_s16(0); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // AND‑mask each load to the low 10 bits so intermediates stay + // within the i16 range the Q15 narrow steps expect — see + // matching comment in [`yuv420p10_to_rgb_row`]. 
+ let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); + let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); + let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); + let v_vec = vandq_u16(vld1q_u16(v_half.as_ptr().add(x / 2)), mask_v); + + let y_lo = vreinterpretq_s16_u16(y_vec_lo); + let y_hi = vreinterpretq_s16_u16(y_vec_hi); + + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = vzip1q_s16(r_chroma, r_chroma); + let r_dup_hi = vzip2q_s16(r_chroma, r_chroma); + let g_dup_lo = vzip1q_s16(g_chroma, g_chroma); + let g_dup_hi = vzip2q_s16(g_chroma, g_chroma); + let b_dup_lo = vzip1q_s16(b_chroma, b_chroma); + let b_dup_hi = vzip2q_s16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_lo, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); + + // Native‑depth output: add Y + chroma in i16, then clamp to + // [0, 1023] explicitly. `vqaddq_s16` saturates at i16 bounds + // (irrelevant here since |sum| < 2047 always), so the subsequent + // max/min clamps to the 10‑bit range. 
+ let r_lo = clamp_u10(vqaddq_s16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10(vqaddq_s16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10(vqaddq_s16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10(vqaddq_s16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10(vqaddq_s16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10(vqaddq_s16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + // Two interleaved u16 writes — each `vst3q_u16` covers 8 pixels. + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + + x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an i16x8 vector to `[0, max]` and reinterprets to u16x8. +/// Used by the 10‑bit u16 output path to avoid `vqmovun_s16`'s u8 +/// saturation. +#[inline(always)] +fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_t { + unsafe { vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(v, zero_v), max_v)) } +} + /// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over the /// shared [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. /// @@ -1124,4 +1423,200 @@ mod tests { assert_eq!(input, back, "swap is not self-inverse"); } + + // ---- yuv420p10 scalar-equivalence ----------------------------------- + + /// Deterministic pseudo‑random `u16` samples in `[0, 1023]` — the + /// 10‑bit range. Upper 6 bits always zero, so the generator matches + /// real `yuv420p10le` bit patterns. 
+ fn p10_plane(n: usize, seed: usize) -> std::vec::Vec { + (0..n) + .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16) + .collect() + } + + fn check_p10_u8_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_neon = std::vec![0u8; width * 3]; + + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + } + + if rgb_scalar != rgb_neon { + let first_diff = rgb_scalar + .iter() + .zip(rgb_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "NEON 10→u8 diverges from scalar at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgb_scalar[first_diff], rgb_neon[first_diff] + ); + } + } + + fn check_p10_u16_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_neon = std::vec![0u16; width * 3]; + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + } + + if rgb_scalar != rgb_neon { + let first_diff = rgb_scalar + .iter() + .zip(rgb_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "NEON 10→u16 diverges from scalar at elem {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgb_scalar[first_diff], rgb_neon[first_diff] + ); + } + } + + #[test] + fn neon_p10_u8_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + 
ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_equivalence(16, m, full); + } + } + } + + #[test] + fn neon_p10_u16_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_equivalence(16, m, full); + } + } + } + + #[test] + fn neon_p10_matches_scalar_odd_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_p10_u8_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn neon_p10_matches_scalar_1920() { + check_p10_u8_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } + + /// Out‑of‑range regression: every kernel AND‑masks each `u16` load + /// to the low `BITS` bits, so **arbitrary** upper‑bit corruption + /// (not just p010 packing) produces scalar/NEON bit‑identical + /// output. This test sweeps three adversarial input shapes: + /// + /// - `p010`: 10 active bits in the high 10 of each `u16` + /// (`sample << 6`) — the canonical mispacking mistake. + /// - `ycgco_worst`: `Y=[0x8000; W]`, `U=[0; W/2]`, `V=[0x8000; W/2]` + /// — the specific Codex‑identified case that used to produce + /// `(1023, 0, 0)` on scalar vs `(0, 0, 0)` on NEON before the + /// load‑time mask was added. + /// - `random`: arbitrary upper‑bit flips with no particular pattern. + /// + /// Each variant runs through every color matrix × range × both + /// output paths (u8 + native‑depth u16) and asserts byte equality. 
+ #[test] + fn neon_p10_matches_scalar_on_out_of_range_samples() { + let width = 32; + + let p010_variant = + |i: usize, seed: u16| 0xFC00u16.wrapping_add(((i as u16).wrapping_mul(seed)) << 6); + let random_variant = |i: usize, seed: u16| { + let x = (i as u32) + .wrapping_mul(seed as u32) + .wrapping_add(0xDEAD_BEEF) as u16; + x ^ 0xA5A5 + }; + + for variant_name in ["p010", "ycgco_worst", "random"] { + let y: std::vec::Vec = match variant_name { + "ycgco_worst" => std::vec![0x8000u16; width], + "p010" => (0..width).map(|i| p010_variant(i, 37)).collect(), + _ => (0..width).map(|i| random_variant(i, 37)).collect(), + }; + let u: std::vec::Vec = match variant_name { + "ycgco_worst" => std::vec![0x0u16; width / 2], + "p010" => (0..width / 2).map(|i| p010_variant(i, 53)).collect(), + _ => (0..width / 2).map(|i| random_variant(i, 53)).collect(), + }; + let v: std::vec::Vec = match variant_name { + "ycgco_worst" => std::vec![0x8000u16; width / 2], + "p010" => (0..width / 2).map(|i| p010_variant(i, 71)).collect(), + _ => (0..width / 2).map(|i| random_variant(i, 71)).collect(), + }; + + for matrix in [ColorMatrix::Bt601, ColorMatrix::Bt709, ColorMatrix::YCgCo] { + for full_range in [true, false] { + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_neon = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::<10>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_neon, + "scalar and NEON diverge on {variant_name} input (matrix={matrix:?}, full_range={full_range})" + ); + + let mut rgb16_scalar = std::vec![0u16; width * 3]; + let mut rgb16_neon = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y, + &u, + &v, + &mut rgb16_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); + } + assert_eq!( + 
rgb16_scalar, rgb16_neon, + "scalar and NEON diverge on {variant_name} u16 output (matrix={matrix:?}, full_range={full_range})" + ); + } + } + } + } } diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 4a092b9..8b0175c 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -37,11 +37,12 @@ use core::arch::wasm32::{ f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_eq, f32x4_lt, f32x4_max, f32x4_min, f32x4_mul, - f32x4_splat, f32x4_sub, i8x16, i8x16_shuffle, i16x8_add_sat, i16x8_narrow_i32x4, i16x8_splat, - i16x8_sub, i32x4_add, i32x4_extend_high_i16x8, i32x4_extend_low_i16x8, i32x4_mul, i32x4_shr, - i32x4_splat, i32x4_trunc_sat_f32x4, u8x16_narrow_i16x8, u8x16_swizzle, u16x8_extend_high_u8x16, - u16x8_extend_low_u8x16, u16x8_load_extend_u8x8, u32x4_extend_high_u16x8, u32x4_extend_low_u16x8, - v128, v128_bitselect, v128_load, v128_or, v128_store, + f32x4_splat, f32x4_sub, i8x16, i8x16_shuffle, i16x8_add_sat, i16x8_max, i16x8_min, + i16x8_narrow_i32x4, i16x8_splat, i16x8_sub, i32x4_add, i32x4_extend_high_i16x8, + i32x4_extend_low_i16x8, i32x4_mul, i32x4_shr, i32x4_splat, i32x4_trunc_sat_f32x4, + u8x16_narrow_i16x8, u8x16_swizzle, u16x8_extend_high_u8x16, u16x8_extend_low_u8x16, + u16x8_load_extend_u8x8, u16x8_splat, u32x4_extend_high_u16x8, u32x4_extend_low_u16x8, v128, + v128_and, v128_bitselect, v128_load, v128_or, v128_store, }; use crate::{ColorMatrix, row::scalar}; @@ -188,6 +189,317 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// WASM simd128 YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. Differences +/// from [`yuv_420_to_rgb_row`]: +/// - Y loads are two `v128_load` (each holds 8 `u16` = 16 bytes); U / V +/// each one `v128_load` (8 `u16`). +/// - No u8→u16 widening — samples already in 16‑bit lanes. +/// - Chroma bias 512 (10‑bit center). +/// - `range_params_n::<10, 8>` calibrates scales for 10→8 in one shift. 
+/// +/// Reuses [`chroma_i16x8`], [`dup_lo`], [`dup_hi`], [`scale_y`], and +/// [`write_rgb_16`]. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: simd128 compile‑time availability is the caller's + // obligation. + unsafe { + let rnd_v = i32x4_splat(RND); + let y_off_v = i16x8_splat(y_off as i16); + let y_scale_v = i32x4_splat(y_scale); + let c_scale_v = i32x4_splat(c_scale); + let bias_v = i16x8_splat(bias as i16); + let mask_v = u16x8_splat(scalar::bits_mask::<10>()); + let cru = i32x4_splat(coeffs.r_u()); + let crv = i32x4_splat(coeffs.r_v()); + let cgu = i32x4_splat(coeffs.g_u()); + let cgv = i32x4_splat(coeffs.g_v()); + let cbu = i32x4_splat(coeffs.b_u()); + let cbv = i32x4_splat(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // AND‑mask each load to the low 10 bits — see matching comment + // in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]. 
+ let y_low_i16 = v128_and(v128_load(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = v128_and(v128_load(y.as_ptr().add(x + 8).cast()), mask_v); + let u_vec = v128_and(v128_load(u_half.as_ptr().add(x / 2).cast()), mask_v); + let v_vec = v128_and(v128_load(v_half.as_ptr().add(x / 2).cast()), mask_v); + + let u_i16 = i16x8_sub(u_vec, bias_v); + let v_i16 = i16x8_sub(v_vec, bias_v); + + let u_lo_i32 = i32x4_extend_low_i16x8(u_i16); + let u_hi_i32 = i32x4_extend_high_i16x8(u_i16); + let v_lo_i32 = i32x4_extend_low_i16x8(v_i16); + let v_hi_i32 = i32x4_extend_high_i16x8(v_i16); + + let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = dup_lo(r_chroma); + let r_dup_hi = dup_hi(r_chroma); + let g_dup_lo = dup_lo(g_chroma); + let g_dup_hi = dup_hi(g_chroma); + let b_dup_lo = dup_lo(b_chroma); + let b_dup_hi = dup_hi(b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let b_lo = i16x8_add_sat(y_scaled_lo, b_dup_lo); + let b_hi = i16x8_add_sat(y_scaled_hi, b_dup_hi); + let g_lo = i16x8_add_sat(y_scaled_lo, g_dup_lo); + let g_hi = i16x8_add_sat(y_scaled_hi, g_dup_hi); + let r_lo = i16x8_add_sat(y_scaled_lo, r_dup_lo); + let r_hi = i16x8_add_sat(y_scaled_hi, r_dup_hi); + + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + + write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + 
x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// WASM simd128 YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB. +/// +/// Block 16 Y pixels. Mirrors [`yuv420p10_to_rgb_row`]'s pre‑write +/// math; output uses explicit `i16x8_min` / `i16x8_max` clamp to +/// `[0, 1023]` and two calls to [`write_rgb_u16_8`] per block. +/// +/// # Numerical contract +/// +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`]. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: simd128 compile‑time availability is the caller's + // obligation. 
+ unsafe { + let rnd_v = i32x4_splat(RND); + let y_off_v = i16x8_splat(y_off as i16); + let y_scale_v = i32x4_splat(y_scale); + let c_scale_v = i32x4_splat(c_scale); + let bias_v = i16x8_splat(bias as i16); + let mask_v = u16x8_splat(scalar::bits_mask::<10>()); + let max_v = i16x8_splat(OUT_MAX_10); + let zero_v = i16x8_splat(0); + let cru = i32x4_splat(coeffs.r_u()); + let crv = i32x4_splat(coeffs.r_v()); + let cgu = i32x4_splat(coeffs.g_u()); + let cgv = i32x4_splat(coeffs.g_v()); + let cbu = i32x4_splat(coeffs.b_u()); + let cbv = i32x4_splat(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // AND‑mask loads to the low 10 bits so `chroma_i16x8`'s + // `i16x8_narrow_i32x4` stays lossless. + let y_low_i16 = v128_and(v128_load(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = v128_and(v128_load(y.as_ptr().add(x + 8).cast()), mask_v); + let u_vec = v128_and(v128_load(u_half.as_ptr().add(x / 2).cast()), mask_v); + let v_vec = v128_and(v128_load(v_half.as_ptr().add(x / 2).cast()), mask_v); + + let u_i16 = i16x8_sub(u_vec, bias_v); + let v_i16 = i16x8_sub(v_vec, bias_v); + + let u_lo_i32 = i32x4_extend_low_i16x8(u_i16); + let u_hi_i32 = i32x4_extend_high_i16x8(u_i16); + let v_lo_i32 = i32x4_extend_low_i16x8(v_i16); + let v_hi_i32 = i32x4_extend_high_i16x8(v_i16); + + let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = dup_lo(r_chroma); + let r_dup_hi = dup_hi(r_chroma); + let g_dup_lo = dup_lo(g_chroma); + let g_dup_hi = dup_hi(g_chroma); + let 
b_dup_lo = dup_lo(b_chroma); + let b_dup_hi = dup_hi(b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let r_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, b_dup_hi), zero_v, max_v); + + let dst = rgb_out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + + x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an i16x8 vector to `[0, max]`. Used by the 10‑bit u16 +/// output path. +#[inline(always)] +fn clamp_u10_wasm(v: v128, zero_v: v128, max_v: v128) -> v128 { + i16x8_min(i16x8_max(v, zero_v), max_v) +} + +/// Writes 8 pixels of packed `u16` RGB (24 `u16` = 48 bytes) using +/// the SSSE3‑style 3‑way interleave pattern adapted to 16‑bit lanes. +/// Mirrors [`crate::row::arch::x86_common::write_rgb_u16_8`] — each +/// output u16 is two adjacent bytes sourced from one of the three +/// channel vectors via `u8x16_swizzle` with a compile‑time byte +/// mask (0xFF / negative zeros the lane, matching `_mm_shuffle_epi8` +/// semantics). +/// +/// # Safety +/// +/// `ptr` must point to at least 48 writable bytes (24 `u16`). Caller +/// must have simd128 enabled at compile time. 
+#[inline(always)] +unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { + unsafe { + // Block 0 = [R0 G0 B0 R1 G1 B1 R2 G2]. Masks identical in shape + // to x86_common::write_rgb_u16_8 — each output u16 pulls two + // adjacent bytes from one channel. + let r0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r0), u8x16_swizzle(g, g0)), + u8x16_swizzle(b, b0), + ); + + // Block 1 = [B2 R3 G3 B3 R4 G4 B4 R5]. + let r1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r1), u8x16_swizzle(g, g1)), + u8x16_swizzle(b, b1), + ); + + // Block 2 = [G5 B5 R6 G6 B6 R7 G7 B7]. + let r2 = i8x16( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g2 = i8x16( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b2 = i8x16( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r2), u8x16_swizzle(g, g2)), + u8x16_swizzle(b, b2), + ); + + v128_store(ptr.cast(), out0); + v128_store(ptr.add(8).cast(), out1); + v128_store(ptr.add(16).cast(), out2); + } +} + /// WASM simd128 NV12 → packed RGB (UV-ordered chroma). Thin wrapper /// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. /// @@ -1008,4 +1320,118 @@ mod tests { check_hsv_equivalence(&rgb[..w * 3], w); } } + + // ---- yuv420p10 scalar-equivalence ----------------------------------- + // + // These tests compile only for `target_arch = "wasm32"` (via the + // outer `target_feature = "simd128"` gate on the module). 
CI + // executes them under wasmtime in the `test-wasm-simd128` job + // (see `.github/workflows/ci.yml`): the lib is compiled for + // `wasm32-wasip1` with `-C target-feature=+simd128` and + // `CARGO_TARGET_WASM32_WASIP1_RUNNER=wasmtime run --` passes each + // compiled `.wasm` test binary to wasmtime. Every scalar‑ + // equivalence check below runs on real SIMD instructions, not + // just a compile check. + + fn p10_plane(n: usize, seed: usize) -> std::vec::Vec { + (0..n) + .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16) + .collect() + } + + fn check_p10_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + + if rgb_scalar != rgb_simd { + let first_diff = rgb_scalar + .iter() + .zip(rgb_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "simd128 10→u8 diverges at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + fn check_p10_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + + if rgb_scalar != rgb_simd { + let first_diff = rgb_scalar + .iter() + .zip(rgb_simd.iter()) + .position(|(a, b)| a != b) + 
.unwrap(); + panic!( + "simd128 10→u16 diverges at elem {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + #[test] + fn simd128_p10_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_simd128_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_p10_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_simd128_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_p10_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_p10_u8_simd128_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_simd128_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn simd128_p10_matches_scalar_1920() { + check_p10_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index beb3b6f..d1a1710 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -39,18 +39,18 @@ //! element order. Every fixup is called out inline. 
use core::arch::x86_64::{ - __m256i, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, _mm256_castsi256_si128, - _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256, _mm256_loadu_si256, - _mm256_mullo_epi32, _mm256_packs_epi32, _mm256_packus_epi16, _mm256_permute2x128_si256, - _mm256_permute4x64_epi64, _mm256_set1_epi16, _mm256_set1_epi32, _mm256_setr_epi8, - _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_sub_epi16, _mm256_unpackhi_epi16, - _mm256_unpacklo_epi16, + __m256i, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, _mm256_and_si256, + _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256, + _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, _mm256_mullo_epi32, _mm256_packs_epi32, + _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set1_epi16, + _mm256_set1_epi32, _mm256_setr_epi8, _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_sub_epi16, + _mm256_unpackhi_epi16, _mm256_unpacklo_epi16, }; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16}, + arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, scalar, }, }; @@ -207,6 +207,325 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// AVX2 YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size 32 Y pixels per iteration (matching the 8‑bit AVX2 +/// kernel). Key differences: +/// - Two `_mm256_loadu_si256` loads for Y (each 16 `u16` = 32 bytes); +/// one load each for U / V (16 `u16` = 32 bytes). +/// - No u8→i16 widening — 10‑bit samples already occupy 16‑bit lanes +/// and fit i16 without overflow. +/// - Chroma bias is 512 (10‑bit center). +/// - `range_params_n::<10, 8>` calibrates scales for 10→8 in one shift. +/// +/// Reuses [`chroma_i16x16`], [`chroma_dup`], [`scale_y`], +/// [`narrow_u8x32`], and [`write_rgb_32`] from the 8‑bit path — the +/// post‑chroma math is identical across bit depths. 
+/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: AVX2 availability is the caller's obligation. + unsafe { + let rnd_v = _mm256_set1_epi32(RND); + let y_off_v = _mm256_set1_epi16(y_off as i16); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + let bias_v = _mm256_set1_epi16(bias as i16); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 32 <= width { + // 32 Y = two `_mm256_loadu_si256` (16 u16 each). U/V each = one + // load of 16 u16. AND‑mask each load to the low 10 bits — see + // matching comment in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]. 
+ let y_low_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), mask_v); + let u_vec = _mm256_and_si256( + _mm256_loadu_si256(u_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + let v_vec = _mm256_and_si256( + _mm256_loadu_si256(v_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + + let u_i16 = _mm256_sub_epi16(u_vec, bias_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16)); + let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let b_lo = _mm256_adds_epi16(y_scaled_lo, b_dup_lo); + let b_hi = _mm256_adds_epi16(y_scaled_hi, b_dup_hi); + let g_lo = _mm256_adds_epi16(y_scaled_lo, g_dup_lo); + let g_hi = _mm256_adds_epi16(y_scaled_hi, g_dup_hi); + let r_lo = _mm256_adds_epi16(y_scaled_lo, 
r_dup_lo); + let r_hi = _mm256_adds_epi16(y_scaled_hi, r_dup_hi); + + let b_u8 = narrow_u8x32(b_lo, b_hi); + let g_u8 = narrow_u8x32(g_lo, g_hi); + let r_u8 = narrow_u8x32(r_lo, r_hi); + + write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + x += 32; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// AVX2 YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB. +/// +/// Block size 32 Y pixels. Mirrors [`yuv420p10_to_rgb_row`]'s +/// pre‑write math; output uses explicit min/max clamp to `[0, 1023]` +/// (`_mm256_packus_epi16` would clip to u8). Writes are issued via +/// four `write_rgb_u16_8` calls per 32‑pixel block — each extracts a +/// 128‑bit half of the AVX2 `i16x16` channel vectors and hands them +/// to the shared SSE4.1 u16 interleave helper. A 256‑bit AVX2 u16 +/// interleave would cut store count in half; left as a follow‑up +/// optimization, since the u16 path is fidelity‑driven rather than +/// throughput‑critical. +/// +/// # Numerical contract +/// +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`]. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: AVX2 availability is the caller's obligation. + unsafe { + let rnd_v = _mm256_set1_epi32(RND); + let y_off_v = _mm256_set1_epi16(y_off as i16); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + let bias_v = _mm256_set1_epi16(bias as i16); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); + let max_v = _mm256_set1_epi16(OUT_MAX_10); + let zero_v = _mm256_set1_epi16(0); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 32 <= width { + // AND‑mask loads to the low 10 bits so `chroma_i16x16`'s + // `_mm256_packs_epi32` narrow stays lossless. 
+ let y_low_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), mask_v); + let u_vec = _mm256_and_si256( + _mm256_loadu_si256(u_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + let v_vec = _mm256_and_si256( + _mm256_loadu_si256(v_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + + let u_i16 = _mm256_sub_epi16(u_vec, bias_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16)); + let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + // Per‑channel saturating add + explicit clamp to [0, 1023]. 
+ let r_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + // Four 8‑pixel u16 writes per 32‑pixel block. Each extracts a + // 128‑bit half of an i16x16 channel and hands it to the shared + // SSE4.1 u16 interleave helper. + let dst = rgb_out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + dst.add(72), + ); + + x += 32; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an `i16x16` vector to `[0, max]` via AVX2 `_mm256_min_epi16` +/// / `_mm256_max_epi16`. Used by the 10‑bit u16 output path where +/// `_mm256_packus_epi16` would incorrectly clip to u8. +#[inline(always)] +fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { + unsafe { _mm256_min_epi16(_mm256_max_epi16(v, zero_v), max_v) } +} + /// AVX2 NV12 → packed RGB (UV-ordered chroma). 
Thin wrapper over /// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. /// @@ -946,4 +1265,114 @@ mod tests { check_hsv_equivalence(&rgb[..w * 3], w); } } + + // ---- yuv420p10 AVX2 scalar-equivalence ------------------------------ + + fn p10_plane(n: usize, seed: usize) -> std::vec::Vec { + (0..n) + .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16) + .collect() + } + + fn check_p10_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + + if rgb_scalar != rgb_simd { + let first_diff = rgb_scalar + .iter() + .zip(rgb_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "AVX2 10→u8 diverges at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + fn check_p10_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + + if rgb_scalar != rgb_simd { + let first_diff = rgb_scalar + .iter() + .zip(rgb_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "AVX2 10→u16 diverges at elem 
{first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + #[test] + fn avx2_p10_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_avx2_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_p10_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_avx2_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_p10_matches_scalar_odd_tail_widths() { + for w in [34usize, 62, 66, 1922] { + check_p10_u8_avx2_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_avx2_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn avx2_p10_matches_scalar_1920() { + check_p10_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index f0f1f93..0c8f941 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -54,8 +54,9 @@ use core::arch::x86_64::{ __m128i, __m512i, _mm_setr_epi8, _mm256_loadu_si256, _mm512_add_epi32, _mm512_adds_epi16, - _mm512_broadcast_i32x4, _mm512_castsi512_si128, _mm512_castsi512_si256, _mm512_cvtepi16_epi32, - _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, _mm512_extracti64x4_epi64, _mm512_loadu_si512, + _mm512_and_si512, _mm512_broadcast_i32x4, _mm512_castsi512_si128, _mm512_castsi512_si256, + _mm512_cvtepi16_epi32, _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, + _mm512_extracti64x4_epi64, _mm512_loadu_si512, _mm512_max_epi16, _mm512_min_epi16, _mm512_mullo_epi32, _mm512_packs_epi32, _mm512_packus_epi16, 
_mm512_permutex2var_epi64, _mm512_permutexvar_epi64, _mm512_set1_epi16, _mm512_set1_epi32, _mm512_setr_epi64, _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_sub_epi16, _mm512_unpackhi_epi16, @@ -65,7 +66,7 @@ use core::arch::x86_64::{ use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16}, + arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, scalar, }, }; @@ -223,6 +224,354 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// AVX‑512 YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size 64 Y pixels / 32 chroma pairs per iteration (matching +/// the 8‑bit AVX‑512 kernel). Structural differences: +/// - Two `_mm512_loadu_si512` loads for Y (each 32 `u16` = 64 bytes); +/// one `_mm512_loadu_si512` each for U / V (32 `u16`). +/// - No u8→i16 widening — 10‑bit samples already occupy 16‑bit lanes. +/// - Chroma bias is 512 (10‑bit center). +/// - `range_params_n::<10, 8>` calibrates scales for 10→8 in one shift. +/// +/// Reuses [`chroma_i16x32`], [`chroma_dup`], [`scale_y`], +/// [`narrow_u8x64`], and [`write_rgb_64`] along with the pack / dup +/// lane‑fixup indices from the 8‑bit path — post‑chroma math is +/// identical across bit depths. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. +/// +/// # Safety +/// +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn yuv420p10_to_rgb_row(
    y: &[u16],
    u_half: &[u16],
    v_half: &[u16],
    rgb_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(u_half.len() >= width / 2);
    debug_assert!(v_half.len() >= width / 2);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10‑bit input → 8‑bit output range calibration in a single shift.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
    let bias = scalar::chroma_bias::<10>();
    const RND: i32 = 1 << 14;

    // SAFETY: AVX‑512BW availability is the caller's obligation.
    unsafe {
        let rnd_v = _mm512_set1_epi32(RND);
        let y_off_v = _mm512_set1_epi16(y_off as i16);
        let y_scale_v = _mm512_set1_epi32(y_scale);
        let c_scale_v = _mm512_set1_epi32(c_scale);
        let bias_v = _mm512_set1_epi16(bias as i16);
        let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16);
        let cru = _mm512_set1_epi32(coeffs.r_u());
        let crv = _mm512_set1_epi32(coeffs.r_v());
        let cgu = _mm512_set1_epi32(coeffs.g_u());
        let cgv = _mm512_set1_epi32(coeffs.g_v());
        let cbu = _mm512_set1_epi32(coeffs.b_u());
        let cbv = _mm512_set1_epi32(coeffs.b_v());

        // Cross‑lane fixup / chroma‑duplication qword index vectors —
        // same shapes as the 8‑bit AVX‑512 kernel uses.
        let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
        let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
        let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);

        let mut x = 0usize;
        while x + 64 <= width {
            // AND‑mask every load to the low 10 bits — see matching
            // comment in [`crate::row::scalar::yuv_420p_n_to_rgb_row`].
            let y_low_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x).cast()), mask_v);
            let y_high_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), mask_v);
            let u_vec = _mm512_and_si512(
                _mm512_loadu_si512(u_half.as_ptr().add(x / 2).cast()),
                mask_v,
            );
            let v_vec = _mm512_and_si512(
                _mm512_loadu_si512(v_half.as_ptr().add(x / 2).cast()),
                mask_v,
            );

            // Re-center chroma around the 10‑bit midpoint (512).
            let u_i16 = _mm512_sub_epi16(u_vec, bias_v);
            let v_i16 = _mm512_sub_epi16(v_vec, bias_v);

            let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16));
            let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16));
            let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16));
            let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16));

            let u_d_lo = q15_shift(_mm512_add_epi32(
                _mm512_mullo_epi32(u_lo_i32, c_scale_v),
                rnd_v,
            ));
            let u_d_hi = q15_shift(_mm512_add_epi32(
                _mm512_mullo_epi32(u_hi_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_lo = q15_shift(_mm512_add_epi32(
                _mm512_mullo_epi32(v_lo_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_hi = q15_shift(_mm512_add_epi32(
                _mm512_mullo_epi32(v_hi_i32, c_scale_v),
                rnd_v,
            ));

            let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
            let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
            let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);

            let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx);
            let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx);
            let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);

            let b_lo = _mm512_adds_epi16(y_scaled_lo, b_dup_lo);
            let b_hi = _mm512_adds_epi16(y_scaled_hi, b_dup_hi);
            let g_lo = _mm512_adds_epi16(y_scaled_lo, g_dup_lo);
            let g_hi = _mm512_adds_epi16(y_scaled_hi, g_dup_hi);
            let r_lo = _mm512_adds_epi16(y_scaled_lo, r_dup_lo);
            let r_hi = _mm512_adds_epi16(y_scaled_hi, r_dup_hi);

            let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup);
            let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup);
            let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup);

            write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));

            x += 64;
        }

        // Residual (< 64 px) pixels fall back to the scalar reference.
        if x < width {
            scalar::yuv_420p_n_to_rgb_row::<10>(
                &y[x..width],
                &u_half[x / 2..width / 2],
                &v_half[x / 2..width / 2],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// AVX‑512 YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB.
///
/// Block size 64 Y pixels per iteration. Mirrors
/// [`yuv420p10_to_rgb_row`]'s pre‑write math; output uses explicit
/// min/max clamp to `[0, 1023]` and 8 calls to [`write_rgb_u16_8`]
/// per block (each handles 8 pixels). A true AVX‑512 u16 interleave
/// would cut store count ~4× (24 × 16‑byte stores down to
/// 6 × 64‑byte stores); left as a follow‑up optimization.
///
/// # Numerical contract
///
/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`].
///
/// # Safety
///
/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.**
/// 2. `width & 1 == 0`.
/// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
///    `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: AVX‑512BW availability is the caller's obligation. + unsafe { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16); + let max_v = _mm512_set1_epi16(OUT_MAX_10); + let zero_v = _mm512_set1_epi16(0); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + let mut x = 0usize; + while x + 64 <= width { + // AND‑mask loads to the low 10 bits so `chroma_i16x32`'s + // `_mm512_packs_epi32` narrow stays lossless. 
+ let y_low_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), mask_v); + let u_vec = _mm512_and_si512( + _mm512_loadu_si512(u_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + let v_vec = _mm512_and_si512( + _mm512_loadu_si512(v_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + let r_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = 
clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + // Eight 8‑pixel u16 writes per 64‑pixel block. For each i16x32 + // channel vector we extract four 128‑bit quarters and hand each + // to the shared SSE4.1 u16 interleave helper. + let dst = rgb_out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + + x += 64; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an `i16x32` vector to `[0, max]` via AVX‑512 +/// `_mm512_min_epi16` / `_mm512_max_epi16`. Used by the 10‑bit u16 +/// output path. +#[inline(always)] +fn clamp_u10_x32(v: __m512i, zero_v: __m512i, max_v: __m512i) -> __m512i { + unsafe { _mm512_min_epi16(_mm512_max_epi16(v, zero_v), max_v) } +} + +/// Writes one 8‑pixel u16 RGB chunk using a 128‑bit quarter of each +/// `i16x32` channel vector. `idx` ∈ `{0,1,2,3}` selects which of the +/// four 128‑bit lanes to extract via `_mm512_extracti32x4_epi32`. +/// +/// # Safety +/// +/// Same as [`write_rgb_u16_8`] — `ptr` must point to at least 48 +/// writable bytes (24 `u16`). 
Caller's `target_feature` must include
/// AVX‑512F + AVX‑512BW (so `_mm512_extracti32x4_epi32` is available)
/// and SSSE3 (for the underlying `_mm_shuffle_epi8` inside
/// `write_rgb_u16_8`).
#[inline(always)]
unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u16) {
    // SAFETY: caller holds the AVX‑512F + AVX‑512BW + SSSE3
    // target‑feature context. `_mm512_extracti32x4_epi32` takes its
    // lane index as a const generic, so the runtime `idx` is dispatched
    // by `match` to one of four constant instantiations; call sites
    // only pass 0..=3, and the `_` arm maps anything else to lane 3.
    unsafe {
        let (rq, gq, bq) = match idx {
            0 => (
                _mm512_extracti32x4_epi32::<0>(r),
                _mm512_extracti32x4_epi32::<0>(g),
                _mm512_extracti32x4_epi32::<0>(b),
            ),
            1 => (
                _mm512_extracti32x4_epi32::<1>(r),
                _mm512_extracti32x4_epi32::<1>(g),
                _mm512_extracti32x4_epi32::<1>(b),
            ),
            2 => (
                _mm512_extracti32x4_epi32::<2>(r),
                _mm512_extracti32x4_epi32::<2>(g),
                _mm512_extracti32x4_epi32::<2>(b),
            ),
            _ => (
                _mm512_extracti32x4_epi32::<3>(r),
                _mm512_extracti32x4_epi32::<3>(g),
                _mm512_extracti32x4_epi32::<3>(b),
            ),
        };
        write_rgb_u16_8(rq, gq, bq, ptr);
    }
}

/// AVX‑512 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over
/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`.
///
@@ -966,4 +1315,114 @@ mod tests {
         check_hsv_equivalence(&rgb[..w * 3], w);
     }
 }
+
+    // ---- yuv420p10 AVX-512 scalar-equivalence ---------------------------
+
+    fn p10_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16)
+            .collect()
+    }
+
+    fn check_p10_u8_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        if !std::arch::is_x86_feature_detected!("avx512bw") {
+            return;
+        }
+        let y = p10_plane(width, 37);
+        let u = p10_plane(width / 2, 53);
+        let v = p10_plane(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+
+        if rgb_scalar != rgb_simd {
+            let first_diff = rgb_scalar
+                .iter()
+                .zip(rgb_simd.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            panic!(
+                "AVX-512 10→u8 diverges at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}",
+                rgb_scalar[first_diff], rgb_simd[first_diff]
+            );
+        }
+    }
+
+    fn check_p10_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        if !std::arch::is_x86_feature_detected!("avx512bw") {
+            return;
+        }
+        let y = p10_plane(width, 37);
+        let u = p10_plane(width / 2, 53);
+        let v = p10_plane(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+
+        if rgb_scalar != rgb_simd {
+            let first_diff = rgb_scalar
+                .iter()
+                .zip(rgb_simd.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            panic!(
+                "AVX-512 10→u16 diverges at elem {first_diff} (width={width}, matrix={matrix:?}, 
full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + #[test] + fn avx512_p10_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_avx512_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_p10_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_avx512_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_p10_matches_scalar_odd_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_p10_u8_avx512_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_avx512_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn avx512_p10_matches_scalar_1920() { + check_p10_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/arch/x86_common.rs b/src/row/arch/x86_common.rs index b78827d..14dc783 100644 --- a/src/row/arch/x86_common.rs +++ b/src/row/arch/x86_common.rs @@ -84,6 +84,74 @@ pub(super) unsafe fn write_rgb_16(r: __m128i, g: __m128i, b: __m128i, ptr: *mut } } +/// Writes 8 pixels of packed **`u16`** RGB (48 bytes = 24 `u16`) +/// from three `u16x8` channel vectors. Drives the SSE4.1 / AVX2 / +/// AVX‑512 high‑bit‑depth kernels' u16 output path. 
+/// +/// Three output blocks of 16 bytes (8 `u16`) each hold: +/// - Block 0: `R0, G0, B0, R1, G1, B1, R2, G2` (u16 indices 0..7) +/// - Block 1: `B2, R3, G3, B3, R4, G4, B4, R5` +/// - Block 2: `G5, B5, R6, G6, B6, R7, G7, B7` +/// +/// Each block is the OR of three `_mm_shuffle_epi8` gathers — one +/// from each of R, G, B — with the byte mask picking a pair of +/// adjacent bytes (lo, hi) for every `u16` sourced from that +/// channel. 0x80 (`-1` as i8) zeros the lane, to be OR'd in by +/// another channel's contribution. +/// +/// # Safety +/// +/// - `ptr` must point to at least 48 writable bytes (aligned or +/// unaligned — we use `storeu`). +/// - The calling function must have SSSE3 available (via SSE4.1 or +/// a superset like AVX2 / AVX‑512BW). +#[inline(always)] +pub(super) unsafe fn write_rgb_u16_8(r: __m128i, g: __m128i, b: __m128i, ptr: *mut u16) { + unsafe { + // Block 0 = [R0 G0 B0 R1 G1 B1 R2 G2] — 8 `u16` = 16 bytes. + // R contributes pairs (0,1), (6,7), (12,13); G pairs (2,3), (8,9), + // (14,15); B pairs (4,5), (10,11). + let r0 = _mm_setr_epi8(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g0 = _mm_setr_epi8(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b0 = _mm_setr_epi8(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r0), _mm_shuffle_epi8(g, g0)), + _mm_shuffle_epi8(b, b0), + ); + + // Block 1 = [B2 R3 G3 B3 R4 G4 B4 R5]. R pairs (6,7), (8,9), + // (10,11); G pairs (6,7), (8,9); B pairs (4,5), (6,7), (8,9). + let r1 = _mm_setr_epi8(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g1 = _mm_setr_epi8(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b1 = _mm_setr_epi8(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r1), _mm_shuffle_epi8(g, g1)), + _mm_shuffle_epi8(b, b1), + ); + + // Block 2 = [G5 B5 R6 G6 B6 R7 G7 B7]. 
R pairs (12,13), (14,15); + // G pairs (10,11), (12,13), (14,15); B pairs (10,11), (12,13), + // (14,15). + let r2 = _mm_setr_epi8( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g2 = _mm_setr_epi8( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b2 = _mm_setr_epi8( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r2), _mm_shuffle_epi8(g, g2)), + _mm_shuffle_epi8(b, b2), + ); + + _mm_storeu_si128(ptr.cast(), out0); + _mm_storeu_si128(ptr.add(8).cast(), out1); + _mm_storeu_si128(ptr.add(16).cast(), out2); + } +} + /// Swaps the outer two channels of 16 packed 3‑byte pixels (48 bytes /// in, 48 bytes out). Drives both BGR→RGB and RGB→BGR conversions /// since the transformation is self‑inverse. diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 3ef5988..297f1fb 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -36,16 +36,16 @@ //! `super::x86_common::write_rgb_16`. 
use core::arch::x86_64::{ - __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, _mm_loadl_epi64, - _mm_loadu_si128, _mm_mullo_epi32, _mm_packs_epi32, _mm_packus_epi16, _mm_set1_epi16, - _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, _mm_srai_epi32, _mm_srli_si128, _mm_sub_epi16, - _mm_unpackhi_epi16, _mm_unpacklo_epi16, + __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_and_si128, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, + _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, _mm_mullo_epi32, _mm_packs_epi32, + _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, + _mm_srai_epi32, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, _mm_unpacklo_epi16, }; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16}, + arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, scalar, }, }; @@ -197,6 +197,272 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// /// # Safety /// +/// SSE4.1 YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. Mirrors +/// [`yuv_420_to_rgb_row`] with three structural differences: +/// - Two `_mm_loadu_si128` loads for Y (each pulls 8 `u16` = 16 bytes); +/// U/V each load 8 `u16` via one `_mm_loadu_si128`. No u8 widening — +/// the samples already occupy 16‑bit lanes. +/// - Chroma bias is 512 (10‑bit center). +/// - `range_params_n::<10, 8>` calibrates `y_scale` / `c_scale` to +/// map 10‑bit input directly to 8‑bit output in one Q15 shift. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: SSE4.1 availability is the caller's obligation; the + // dispatcher in `crate::row` verifies it. Pointer adds are bounded + // by the `while x + 16 <= width` loop condition and the caller‑ + // promised slice lengths. + unsafe { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // 16 Y = two `u16x8` loads; 8 U + 8 V = one load each. Each + // load is AND‑masked to the low 10 bits (see matching comment + // in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]). Valid + // 10‑bit samples ≤ 1023 pass through unchanged. 
+ let y_low_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), mask_v); + let u_vec = _mm_and_si128(_mm_loadu_si128(u_half.as_ptr().add(x / 2).cast()), mask_v); + let v_vec = _mm_and_si128(_mm_loadu_si128(v_half.as_ptr().add(x / 2).cast()), mask_v); + + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = _mm_unpacklo_epi16(r_chroma, r_chroma); + let r_dup_hi = _mm_unpackhi_epi16(r_chroma, r_chroma); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma, g_chroma); + let g_dup_hi = _mm_unpackhi_epi16(g_chroma, g_chroma); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma, b_chroma); + let b_dup_hi = _mm_unpackhi_epi16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let b_lo = _mm_adds_epi16(y_scaled_lo, b_dup_lo); + let b_hi = _mm_adds_epi16(y_scaled_hi, b_dup_hi); + let g_lo = _mm_adds_epi16(y_scaled_lo, g_dup_lo); + let g_hi = _mm_adds_epi16(y_scaled_hi, g_dup_hi); + let r_lo = _mm_adds_epi16(y_scaled_lo, r_dup_lo); + let 
r_hi = _mm_adds_epi16(y_scaled_hi, r_dup_hi); + + let b_u8 = _mm_packus_epi16(b_lo, b_hi); + let g_u8 = _mm_packus_epi16(g_lo, g_hi); + let r_u8 = _mm_packus_epi16(r_lo, r_hi); + + write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// SSE4.1 YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB. +/// +/// Block size 16 Y pixels per iteration; writes two 8‑pixel u16 RGB +/// chunks via [`write_rgb_u16_8`]. Shares all pre‑write math with the +/// u8 output path; the key differences: +/// - `range_params_n::<10, 10>` → `y_scale` / `c_scale` target the +/// 10‑bit output range (values in `[0, 1023]` at Q15 exit). +/// - Clamp is explicit min/max to `[0, 1023]` — `_mm_packus_epi16` +/// would clip to u8, so we can't reuse it here. +/// +/// # Numerical contract +/// +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`]. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: SSE4.1 availability is the caller's obligation. + unsafe { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let max_v = _mm_set1_epi16(OUT_MAX_10); + let zero_v = _mm_set1_epi16(0); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // AND‑mask each load to the low 10 bits — critical for the + // u16 output path since its larger `y_scale` / `c_scale` + // (32768 for 10→10 full range) would let an out‑of‑range + // sample push a `coeff * v_d` product past i16 range, + // triggering information loss in the subsequent + // `_mm_packs_epi32` narrow step inside `chroma_i16x8`. 
+ let y_low_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), mask_v); + let u_vec = _mm_and_si128(_mm_loadu_si128(u_half.as_ptr().add(x / 2).cast()), mask_v); + let v_vec = _mm_and_si128(_mm_loadu_si128(v_half.as_ptr().add(x / 2).cast()), mask_v); + + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = _mm_unpacklo_epi16(r_chroma, r_chroma); + let r_dup_hi = _mm_unpackhi_epi16(r_chroma, r_chroma); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma, g_chroma); + let g_dup_hi = _mm_unpackhi_epi16(g_chroma, g_chroma); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma, b_chroma); + let b_dup_hi = _mm_unpackhi_epi16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + // Per‑channel sum + clamp to [0, 1023]. 
+ let r_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + // Two 8‑pixel u16 writes cover the 16‑pixel block. + write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + + x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an i16x8 vector to `[0, max]` for the 10‑bit u16 output +/// path. `_mm_packus_epi16` would clip to u8, so we use explicit +/// min/max. +#[inline(always)] +fn clamp_u10(v: __m128i, zero_v: __m128i, max_v: __m128i) -> __m128i { + unsafe { _mm_min_epi16(_mm_max_epi16(v, zero_v), max_v) } +} + /// Same as [`nv12_or_nv21_to_rgb_row_impl`]. 
 #[inline]
 #[target_feature(enable = "sse4.1")]
@@ -836,4 +1102,114 @@ mod tests {
         check_hsv_equivalence(&rgb[..w * 3], w);
     }
 }
+
+    // ---- yuv420p10 scalar-equivalence -----------------------------------
+
+    fn p10_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16)
+            .collect()
+    }
+
+    fn check_p10_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = p10_plane(width, 37);
+        let u = p10_plane(width / 2, 53);
+        let v = p10_plane(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+
+        if rgb_scalar != rgb_simd {
+            let first_diff = rgb_scalar
+                .iter()
+                .zip(rgb_simd.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            panic!(
+                "SSE4.1 10→u8 diverges at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}",
+                rgb_scalar[first_diff], rgb_simd[first_diff]
+            );
+        }
+    }
+
+    fn check_p10_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = p10_plane(width, 37);
+        let u = p10_plane(width / 2, 53);
+        let v = p10_plane(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+
+        if rgb_scalar != rgb_simd {
+            let first_diff = rgb_scalar
+                .iter()
+                .zip(rgb_simd.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            panic!(
+                "SSE4.1 10→u16 diverges at elem {first_diff} 
(width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + #[test] + fn sse41_p10_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_sse41_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_p10_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_sse41_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_p10_matches_scalar_odd_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_p10_u8_sse41_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_sse41_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn sse41_p10_matches_scalar_1920() { + check_p10_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/mod.rs b/src/row/mod.rs index abe4782..788789b 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -314,6 +314,190 @@ pub fn nv21_to_rgb_row( scalar::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); } +/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 10 active bits in the low bits of each +/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), +/// with the conversion clamping to `[0, 255]` — the native‑depth +/// path is [`yuv420p10_to_rgb_u16_row`]. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv420p10_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv420p10_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv420p10_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv420p10_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv420p10_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed +/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, +/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless +/// downstream HDR processing when the consumer expects low‑bit‑packed +/// samples. +/// +/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` +/// elements, each in `[0, 1023]` with the upper 6 bits zero. +/// +/// This is **not** the FFmpeg `p010` layout — `p010` stores samples +/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers +/// feeding this output into a p010 consumer must shift left by 6 +/// before handing off. +/// +/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. /// @@ -490,6 +674,21 @@ fn rgb_row_bytes(width: usize) -> usize { } } +/// Element count of one packed `u16`‑RGB row (`width × 3`). Identical +/// math to [`rgb_row_bytes`] — the returned value is in `u16` +/// elements, not bytes. Callers use it to size `&mut [u16]` buffers +/// for the `u16` output path. `width × 3` overflow still matters on +/// 32‑bit targets: the product names the number of elements the +/// caller allocates, and downstream SIMD kernels index with it +/// directly without re‑multiplying. 
+#[cfg_attr(not(tarpaulin), inline(always))] +fn rgb_row_elems(width: usize) -> usize { + match width.checked_mul(3) { + Some(n) => n, + None => panic!("width ({width}) × 3 overflows usize"), + } +} + // ---- runtime CPU feature detection ----------------------------------- // // Each `*_available` helper returns `true` iff the named feature is diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 34ab55e..9c6afa4 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -175,6 +175,233 @@ fn clamp_u8(v: i32) -> u8 { v.clamp(0, 255) as u8 } +// ---- High-bit-depth YUV 4:2:0 → RGB (BITS ∈ {10, 12, 14}) ------------- + +/// Converts one row of high-bit-depth 4:2:0 YUV (`u16` samples in the +/// low `BITS` bits of each element) directly to **8-bit** packed RGB. +/// +/// `BITS` is the active input bit depth (10/12/14). Chroma bias is +/// `128 << (BITS - 8)` and the Q15 coefficients plus i32 intermediates +/// work unchanged across all three depths — only the range‑scaling +/// params ([`range_params_n`]) change with `BITS`. 16‑bit input is +/// not handled here because the i32 chroma sum would overflow. +/// +/// Output semantics match [`yuv_420_to_rgb_row`]: the final clamp is +/// to `[0, 255]`, so the scale inside [`range_params_n`] targets an +/// 8‑bit output range — the kernel sheds the extra `BITS - 8` bits of +/// source precision inline rather than converting first at `BITS` and +/// then downshifting. This keeps the fast path a single Q15 shift. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p_n_to_rgb_row<const BITS: usize>(
+    y: &[u16],
+    u_half: &[u16],
+    v_half: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+    debug_assert!(y.len() >= width, "y row too short");
+    debug_assert!(u_half.len() >= width / 2, "u_half row too short");
+    debug_assert!(v_half.len() >= width / 2, "v_half row too short");
+    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+
+    let coeffs = Coefficients::for_matrix(matrix);
+    let (y_off, y_scale, c_scale) = range_params_n::<BITS, 8>(full_range);
+    let bias = chroma_bias::<BITS>();
+    let mask = bits_mask::<BITS>();
+
+    // Every sample is AND‑masked to the low `BITS` bits on load. This
+    // eliminates architecture‑dependent divergence on mispacked input
+    // (e.g. `p010`‑style buffers where the 10 active bits sit in the
+    // high bits of each `u16`): after masking, every backend sees the
+    // same in‑range sample, so the whole Q15 pipeline stays bounded
+    // (intermediate chroma sums fit i16 as designed, no saturating
+    // narrow loses information). For valid input every mask is a
+    // no‑op. For malformed input the "wrong" output is identical
+    // across scalar + all 5 SIMD backends.
+ let mut x = 0; + while x < width { + let c_idx = x / 2; + let u_d = q15_scale((u_half[c_idx] & mask) as i32 - bias, c_scale); + let v_d = q15_scale((v_half[c_idx] & mask) as i32 - bias, c_scale); + + let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); + let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); + let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); + + let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); + rgb_out[x * 3] = clamp_u8(y0 + r_chroma); + rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); + rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + + let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale); + rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); + rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); + rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + + x += 2; + } +} + +/// `(sample * scale_q15 + RND) >> 15`. With input masked to BITS, +/// the `sample * scale` product cannot overflow i32 for any +/// reasonable `OUT_BITS ≤ 16`, so plain arithmetic is sufficient. +#[cfg_attr(not(tarpaulin), inline(always))] +fn q15_scale(sample: i32, scale_q15: i32) -> i32 { + (sample * scale_q15 + (1 << 14)) >> 15 +} + +/// `(c_u * u_d + c_v * v_d + RND) >> 15`. Chroma sum max ≈ 10⁹ for +/// 14‑bit masked input, well within i32. +#[cfg_attr(not(tarpaulin), inline(always))] +fn q15_chroma(c_u: i32, u_d: i32, c_v: i32, v_d: i32) -> i32 { + (c_u * u_d + c_v * v_d + (1 << 14)) >> 15 +} + +/// Converts one row of high‑bit‑depth 4:2:0 YUV to **`u16`** packed +/// RGB at the **input's native bit depth** (`BITS`). +/// +/// Output is **low‑bit‑packed**: for 10‑bit input each `u16` holds a +/// value in `[0, 1023]` with the upper 6 bits zero — matching +/// FFmpeg's `yuv420p10le` convention. 12‑ and 14‑bit inputs produce +/// `[0, 4095]` / `[0, 16383]` respectively, again in the low bits. 
+/// +/// This is **not** the FFmpeg `p010` layout: `p010` puts samples in +/// the **high** 10 bits of each `u16` (effectively `sample << 6`). +/// Callers routing this output to a p010 consumer must shift left +/// by `16 - BITS`. +/// +/// This is the fidelity‑preserving path: no bits are shed inside the +/// conversion, so the output retains the full dynamic range of the +/// source for HDR tone mapping, 10‑bit scene analysis, and similar +/// downstream work. Callers who only need 8‑bit output should prefer +/// [`yuv_420p_n_to_rgb_row`], which is ~2× faster. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p_n_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + debug_assert!(y.len() >= width, "y row too short"); + debug_assert!(u_half.len() >= width / 2, "u_half row too short"); + debug_assert!(v_half.len() >= width / 2, "v_half row too short"); + debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + + let coeffs = Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = range_params_n::(full_range); + let bias = chroma_bias::(); + let out_max: i32 = (1i32 << BITS) - 1; + let mask = bits_mask::(); + + // Every sample AND‑masked to the low `BITS` bits — see matching + // comment in [`yuv_420p_n_to_rgb_row`]. Critical for the native‑ + // depth u16 output path: `range_params_n::<10, 10>` uses + // `y_scale = c_scale = 32768` (unit Q15 for BITS==OUT_BITS full + // range), so an unmasked out‑of‑range sample would push `u_d` / + // `v_d` to ±32256 and the subsequent `coeff * v_d` exceeds i16 + // range — breaking the SIMD kernels' `vqmovn_s32` narrow step. 
+ // Masking keeps every intermediate bounded by design. + let mut x = 0; + while x < width { + let c_idx = x / 2; + let u_d = q15_scale((u_half[c_idx] & mask) as i32 - bias, c_scale); + let v_d = q15_scale((v_half[c_idx] & mask) as i32 - bias, c_scale); + + let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); + let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); + let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); + + let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); + rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; + rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + + let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale); + rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; + rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; + rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + + x += 2; + } +} + +/// Compile‑time sample mask for `BITS`: `(1 << BITS) - 1` as `u16`. +/// Returns `0x03FF` for 10‑bit, `0x0FFF` for 12‑bit, `0x3FFF` for +/// 14‑bit. SIMD backends splat this into a vector constant and AND +/// every load against it. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) const fn bits_mask() -> u16 { + ((1u32 << BITS) - 1) as u16 +} + +/// Chroma bias for input bit depth `BITS` — `128 << (BITS - 8)`. +/// 128 for 8‑bit, 512 for 10‑bit, 2048 for 12‑bit, 8192 for 14‑bit. +/// Exposed at module visibility so SIMD backends can reuse it. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) const fn chroma_bias() -> i32 { + 128i32 << (BITS - 8) +} + +/// Range‑scaling params `(y_off, y_scale_q15, c_scale_q15)` for the +/// high‑bit‑depth kernel family. +/// +/// `BITS` is the input bit depth (10 / 12 / 14); `OUT_BITS` is the +/// target output range (8 for u8‑packed RGB, equal to `BITS` for +/// native‑depth `u16` output). 
+/// +/// The scales are chosen so that after `((sample - y_off) * scale + RND) >> 15` +/// the result lies in `[0, (1 << OUT_BITS) - 1]` without further +/// downshifting. This keeps the fast path a single Q15 multiply for +/// both output widths. +/// +/// - Full range: luma and chroma both use the same scale, mapping +/// `[0, in_max]` to `[0, out_max]`. Same shape as 8‑bit's +/// `(0, 1<<15, 1<<15)` for `BITS == OUT_BITS`. +/// - Limited range: luma maps `[16·k, 235·k]` to `[0, out_max]`, +/// chroma maps `[16·k, 240·k]` to `[0, out_max]`, where +/// `k = 1 << (BITS - 8)`. Matches FFmpeg's `AVCOL_RANGE_MPEG` +/// semantics. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) const fn range_params_n( + full_range: bool, +) -> (i32, i32, i32) { + let in_max: i64 = (1i64 << BITS) - 1; + let out_max: i64 = (1i64 << OUT_BITS) - 1; + if full_range { + // `scale = round((out_max << 15) / in_max)`. For `BITS == OUT_BITS` + // the quotient is exactly `1 << 15` (no rounding needed); for + // 10‑bit→8‑bit it's `(255 << 15) / 1023 ≈ 8167.5`, which rounds to 8168. + let scale = ((out_max << 15) + in_max / 2) / in_max; + (0, scale as i32, scale as i32) + } else { + let y_off = 16i32 << (BITS - 8); + let y_range: i64 = 219i64 << (BITS - 8); + let c_range: i64 = 224i64 << (BITS - 8); + let y_scale = ((out_max << 15) + y_range / 2) / y_range; + let c_scale = ((out_max << 15) + c_range / 2) / c_range; + (y_off, y_scale as i32, c_scale as i32) + } +} + /// Range-scaling params: `(y_off, y_scale_q15, c_scale_q15)`. /// /// Full range: no offset, unit scales (Q15 = 2^15). @@ -620,4 +847,147 @@ mod tests { rgb_to_hsv_row(&rgb, &mut h, &mut s, &mut v, 1); assert_eq!((h[0], s[0], v[0]), (120, 255, 255)); } + + // ---- yuv_420p_n_to_rgb_row (10-bit → u8) ----------------------------- + + #[test] + fn yuv420p10_rgb_black_full_range() { + // Y=0, neutral chroma (512 in 10-bit) → black. 
+ let y = [0u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); + } + + #[test] + fn yuv420p10_rgb_white_full_range() { + // 10-bit full-range white is Y=1023. + let y = [1023u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); + } + + #[test] + fn yuv420p10_rgb_gray_is_gray() { + // Mid-gray 10-bit Y=512 ↔ 8-bit 128. Within ±1 for Q15 rounding. + let y = [512u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); + assert_eq!(r, g); + assert_eq!(g, b); + assert!(r.abs_diff(128) <= 1, "got {r}"); + } + } + + #[test] + fn yuv420p10_rgb_limited_range_black_and_white() { + // 10-bit limited: Y=64 → black, Y=940 → white. + let y = [64u16, 64, 940, 940]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, false); + assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); + assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); + assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); + assert_eq!((rgb[9], rgb[10], rgb[11]), (255, 255, 255)); + } + + #[test] + fn yuv420p10_rgb_chroma_shared_across_pair() { + // Two 10-bit Y values sharing chroma: output is gray = Y>>2. + let y = [200u16, 800, 200, 800]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + // Full-range 10→8 scale = 255/1023, so Y=200 → 50, Y=800 → 199.4 → 199. + // Allow ±1 for Q15 rounding. 
+ assert!(rgb[0].abs_diff(50) <= 1, "got {}", rgb[0]); + assert!(rgb[3].abs_diff(199) <= 1, "got {}", rgb[3]); + assert!(rgb[6].abs_diff(50) <= 1, "got {}", rgb[6]); + assert!(rgb[9].abs_diff(199) <= 1, "got {}", rgb[9]); + } + + // ---- yuv_420p_n_to_rgb_u16_row (10-bit → 10-bit u16) ---------------- + + #[test] + fn yuv420p10_rgb_u16_black_full_range() { + let y = [0u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u16; 12]; + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); + } + + #[test] + fn yuv420p10_rgb_u16_white_full_range() { + // 10-bit input Y=1023, full-range scale=1 → output Y=1023 on each channel. + let y = [1023u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u16; 12]; + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 1023), "got {rgb:?}"); + } + + #[test] + fn yuv420p10_rgb_u16_limited_range_endpoints() { + // Limited-range: Y=64 → 0, Y=940 → 1023 in 10-bit output. + let y = [64u16, 940]; + let u = [512u16; 1]; + let v = [512u16; 1]; + let mut rgb = [0u16; 6]; + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 2, ColorMatrix::Bt709, false); + assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); + assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); + } + + #[test] + fn yuv420p10_rgb_u16_preserves_full_10bit_precision() { + // Sanity: the u16 path retains native-depth precision, so two + // inputs that round to the same u8 are distinguishable in u16. + // Full-range Y=200 vs Y=201: same u8 output (50 vs 50) but + // distinct u16 outputs (200 vs 201). 
+ let y = [200u16, 201]; + let u = [512u16; 1]; + let v = [512u16; 1]; + let mut rgb8 = [0u8; 6]; + let mut rgb16 = [0u16; 6]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb8, 2, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb16, 2, ColorMatrix::Bt601, true); + assert_eq!(rgb8[0], rgb8[3]); + assert_ne!(rgb16[0], rgb16[3]); + } + + #[test] + fn yuv420p10_bt709_ycgco_differ_for_chroma() { + // Non-neutral chroma — different matrices produce different RGB. + let y = [512u16; 2]; + let u = [512u16; 1]; + let v = [800u16; 1]; + let mut bt709 = [0u8; 6]; + let mut ycgco = [0u8; 6]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut bt709, 2, ColorMatrix::Bt709, true); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut ycgco, 2, ColorMatrix::YCgCo, true); + let sad: i32 = bt709 + .iter() + .zip(ycgco.iter()) + .map(|(a, b)| (*a as i32 - *b as i32).abs()) + .sum(); + assert!( + sad > 20, + "matrices should materially differ: {bt709:?} vs {ycgco:?}" + ); + } } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 3ad5ace..432994e 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -18,8 +18,14 @@ use thiserror::Error; use crate::{ HsvBuffers, PixelSink, SourceFormat, - row::{nv12_to_rgb_row, nv21_to_rgb_row, rgb_to_hsv_row, yuv_420_to_rgb_row}, - yuv::{Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, Yuv420p, Yuv420pRow, Yuv420pSink}, + row::{ + nv12_to_rgb_row, nv21_to_rgb_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv420p10_to_rgb_row, + yuv420p10_to_rgb_u16_row, + }, + yuv::{ + Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, Yuv420p, Yuv420p10, Yuv420p10Row, + Yuv420p10Sink, Yuv420pRow, Yuv420pSink, + }, }; /// Errors returned by [`MixedSinker`] configuration and per-frame @@ -58,6 +64,19 @@ pub enum MixedSinkerError { actual: usize, }, + /// `u16` RGB buffer attached via [`MixedSinker::with_rgb_u16`] / + /// [`MixedSinker::set_rgb_u16`] is shorter than `width × height × 3` + /// `u16` elements. 
Only the high‑bit‑depth source impls + /// (currently [`Yuv420p10`](crate::yuv::Yuv420p10)) write into this + /// buffer. + #[error("MixedSinker rgb_u16 buffer too short: expected >= {expected} elements, got {actual}")] + RgbU16BufferTooShort { + /// Minimum `u16` elements required (`width × height × 3`). + expected: usize, + /// `u16` elements supplied. + actual: usize, + }, + /// Luma buffer is shorter than `width × height`. #[error("MixedSinker luma buffer too short: expected >= {expected} bytes, got {actual}")] LumaBufferTooShort { @@ -97,17 +116,24 @@ pub enum MixedSinkerError { /// direct `process` callers that bypass the walker (hand-crafted /// rows, replayed rows, etc.) before a wrong-shaped slice reaches /// an unsafe SIMD kernel. + /// + /// Lengths are expressed in **slice elements** — `u8` bytes for + /// the 8‑bit source rows (Y, U/V half, UV/VU half) and `u16` + /// elements for the 10‑bit source rows (Y10, U/V half 10). The + /// message deliberately says "elements" rather than "bytes" so the + /// same variant can serve both the `u8` and `u16` row families. #[error( - "MixedSinker row shape mismatch at row {row}: {which} slice has {actual} bytes, expected {expected}" + "MixedSinker row shape mismatch at row {row}: {which} slice has {actual} elements, expected {expected}" )] RowShapeMismatch { /// Which slice mismatched. See [`RowSlice`] for variants. which: RowSlice, /// Row index reported by the offending row. row: usize, - /// Expected slice length in bytes (given the sink's configured width). + /// Expected slice length in elements of the slice's element type + /// (`u8` for 8‑bit source rows; `u16` for 10‑bit source rows). expected: usize, - /// Actual slice length supplied by the row. + /// Actual slice length in the same unit as `expected`. actual: usize, }, @@ -181,6 +207,18 @@ pub enum RowSlice { /// pairs — byte order swapped relative to [`Self::UvHalf`]. 
#[display("VU Half")] VuHalf, + /// Full‑width Y row of a **10‑bit** planar source ([`Yuv420p10`]). + /// `u16` samples, `width` elements. + #[display("Y10")] + Y10, + /// Half‑width U row of a **10‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("U Half 10")] + UHalf10, + /// Half‑width V row of a **10‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("V Half 10")] + VHalf10, } /// A sink that writes any subset of `{RGB, Luma, HSV}` into @@ -206,6 +244,7 @@ pub enum RowSlice { /// [`Nv21`](crate::yuv::Nv21). pub struct MixedSinker<'a, F: SourceFormat> { rgb: Option<&'a mut [u8]>, + rgb_u16: Option<&'a mut [u16]>, luma: Option<&'a mut [u8]>, hsv: Option>, width: usize, @@ -231,6 +270,7 @@ impl MixedSinker<'_, F> { pub fn new(width: usize, height: usize) -> Self { Self { rgb: None, + rgb_u16: None, luma: None, hsv: None, width, @@ -241,12 +281,22 @@ impl MixedSinker<'_, F> { } } - /// Returns `true` iff the sinker will write RGB. + /// Returns `true` iff the sinker will write 8‑bit RGB. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn produces_rgb(&self) -> bool { self.rgb.is_some() } + /// Returns `true` iff the sinker will write `u16` RGB at the + /// source's native bit depth. Only high‑bit‑depth source impls + /// (currently [`Yuv420p10`](crate::yuv::Yuv420p10)) honor this + /// buffer — attaching it on an 8‑bit source format is legal but + /// no writes occur. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn produces_rgb_u16(&self) -> bool { + self.rgb_u16.is_some() + } + /// Returns `true` iff the sinker will write luma. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn produces_luma(&self) -> bool { @@ -338,6 +388,14 @@ impl<'a, F: SourceFormat> MixedSinker<'a, F> { Ok(self) } + // NOTE: `with_rgb_u16` / `set_rgb_u16` are **not** declared here. 
+ // They live on a format‑specific impl block further down (currently + // [`MixedSinker`]) so the buffer can only be attached to + // sink types whose `PixelSink` impl actually writes it. Attaching a + // `u16` RGB buffer to a [`Yuv420p`] / [`Nv12`] / [`Nv21`] sink is a + // compile error, not a silent stale‑state bug. Future high‑bit‑depth + // markers (12‑bit, 14‑bit, P010) will add their own impl blocks. + /// Attaches a single-plane luma output buffer. /// Returns `Err(LumaBufferTooShort)` if `buf.len() < width × height`, /// or `Err(GeometryOverflow)` on 32‑bit overflow. @@ -835,6 +893,221 @@ impl PixelSink for MixedSinker<'_, Nv21> { } } +// ---- Yuv420p10 impl ----------------------------------------------------- + +impl<'a> MixedSinker<'a, Yuv420p10> { + /// Attaches a packed **`u16`** RGB output buffer. Only available on + /// sinkers whose source format populates native‑depth `u16` RGB — + /// calling `with_rgb_u16` on an 8‑bit source sinker (e.g. + /// [`MixedSinker`]) is a compile error rather than a + /// silent no‑op that would leave the caller's buffer stale. + /// + /// Length is measured in `u16` **elements** (not bytes): minimum + /// `width × height × 3`. Each element carries a 10‑bit value in + /// the **low** 10 bits (upper 6 bits zero), matching FFmpeg's + /// `yuv420p10le` convention. This is **not** the `p010` layout + /// (which stores samples in the high 10 bits); callers feeding a + /// p010 consumer must shift the output left by 6. + /// + /// Returns `Err(RgbU16BufferTooShort)` if + /// `buf.len() < width × height × 3`, or `Err(GeometryOverflow)` + /// on 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). The + /// required length is measured in `u16` **elements**, not bytes. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + // Packed RGB requires `width × height × 3` channel values — + // that's the same count whether the element type is `u8` or + // `u16`, so the [`Self::frame_bytes`] helper (named for the u8 + // RGB path's byte count) gives the element count here too. No + // size conversion needed. + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl Yuv420p10Sink for MixedSinker<'_, Yuv420p10> {} + +impl PixelSink for MixedSinker<'_, Yuv420p10> { + type Input<'r> = Yuv420p10Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: Yuv420p10Row<'_>) -> Result<(), Self::Error> { + // Bit depth is fixed by the format (10) — declared as a const so + // the downshift for u8 luma stays obvious at the call site. + const BITS: u32 = 10; + + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + // Defense in depth — see the [`Yuv420p`] impl for the rationale. + // Row slice checks use the 10‑bit variants of [`RowSlice`] so + // downstream log output disambiguates from the 8‑bit source impls. 
+ if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y10, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.u_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf10, + row: idx, + expected: w / 2, + actual: row.u_half().len(), + }); + } + if row.v_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::VHalf10, + row: idx, + expected: w / 2, + actual: row.v_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. + } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + // Luma: downshift 10‑bit Y to 8‑bit for the existing u8 luma + // buffer contract. Bit‑extension by `(BITS - 8)` preserves the + // most significant bits — functionally equivalent to FFmpeg's + // `>> (BITS - 8)` conversion used by many downstream analyses. + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> (BITS - 8)) as u8; + } + } + + // `u16` RGB output — written directly via the native‑depth row + // primitive. Computed independently of the u8 path: the two + // outputs have different scale params inside `range_params_n`, + // so they can't share an intermediate without losing precision. 
+ if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + yuv420p10_to_rgb_u16_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + // 8‑bit RGB path — either writes to the caller's buffer (when + // `with_rgb` is set) or to the lazily‑grown scratch (when HSV is + // requested without RGB). Mirrors the 8‑bit source impls' layout. + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + yuv420p10_to_rgb_row( + row.y(), + row.u_half(), + row.v_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + /// Returns `Ok(())` iff the walker's frame dimensions exactly match /// the sinker's configured dimensions. 
Called from /// [`PixelSink::begin_frame`] on both `MixedSinker` and @@ -872,8 +1145,8 @@ mod tests { use super::*; use crate::{ ColorMatrix, - frame::{Nv12Frame, Nv21Frame, Yuv420pFrame}, - yuv::{nv12_to, nv21_to, yuv420p_to}, + frame::{Nv12Frame, Nv21Frame, Yuv420p10Frame, Yuv420pFrame}, + yuv::{nv12_to, nv21_to, yuv420p_to, yuv420p10_to}, }; fn solid_yuv420p_frame( @@ -1739,4 +2012,162 @@ mod tests { assert_eq!(rgb_nv12, rgb_nv21); } + + // ---- Yuv420p10 -------------------------------------------------------- + + fn solid_yuv420p10_frame( + width: u32, + height: u32, + y: u16, + u: u16, + v: u16, + ) -> (Vec, Vec, Vec) { + let w = width as usize; + let h = height as usize; + let cw = w / 2; + let ch = h / 2; + ( + std::vec![y; w * h], + std::vec![u; cw * ch], + std::vec![v; cw * ch], + ) + } + + #[test] + fn yuv420p10_rgb_u8_only_gray_is_gray() { + // 10-bit mid-gray: Y=512, UV=512 → 8-bit RGB ≈ 128 on every channel. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } + } + + #[test] + fn yuv420p10_rgb_u16_only_native_depth_gray() { + // Same mid-gray frame → u16 RGB output in native 10-bit depth, so + // each channel should be ≈ 512 (the 10-bit mid). 
+ let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(512) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + // Upper 6 bits of each u16 must be zero — 10-bit convention. + assert!(px[0] <= 1023); + } + } + + #[test] + fn yuv420p10_rgb_u8_and_u16_both_populated() { + // 10-bit full-range white: Y=1023, UV=512. Both buffers should + // fill with their respective "white" values (255 for u8, 1023 for + // u16) in the same call. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 1023, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(rgb_u8.iter().all(|&c| c == 255)); + assert!(rgb_u16.iter().all(|&c| c == 1023)); + } + + #[test] + fn yuv420p10_luma_downshifts_to_8bit() { + // Y=512 at 10 bits → 512 >> 2 = 128 at 8 bits. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&l| l == 128)); + } + + #[test] + fn yuv420p10_hsv_from_gray_is_zero_hue_zero_sat() { + // HSV derived from the internal u8 RGB scratch: neutral gray → + // H=0, S=0, V≈128. 
Exercises the "HSV without RGB" scratch path + // on the 10-bit source. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut h = std::vec![0xFFu8; 16 * 8]; + let mut s = std::vec![0xFFu8; 16 * 8]; + let mut v = std::vec![0xFFu8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_hsv(&mut h, &mut s, &mut v) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(h.iter().all(|&b| b == 0)); + assert!(s.iter().all(|&b| b == 0)); + assert!(v.iter().all(|&b| b.abs_diff(128) <= 1)); + } + + #[test] + fn yuv420p10_rgb_u16_too_short_returns_err() { + let mut rgb = std::vec![0u16; 10]; // Way too small. + let err = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .err() + .unwrap(); + assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { .. })); + } + + #[test] + fn yuv420p10_with_simd_false_matches_with_simd_true() { + // The SIMD toggle exercises scalar-vs-SIMD dispatch. Both paths + // must produce byte-identical results on both outputs. 
+ let (yp, up, vp) = solid_yuv420p10_frame(64, 16, 600, 400, 700); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; + let mut s_scalar = MixedSinker::::new(64, 16) + .with_simd(false) + .with_rgb(&mut rgb_scalar) + .unwrap() + .with_rgb_u16(&mut rgb_u16_scalar) + .unwrap(); + yuv420p10_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + + let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; + let mut s_simd = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_simd) + .unwrap() + .with_rgb_u16(&mut rgb_u16_simd) + .unwrap(); + yuv420p10_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); + + assert_eq!(rgb_scalar, rgb_simd); + assert_eq!(rgb_u16_scalar, rgb_u16_simd); + } } diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs index ac2647e..655b706 100644 --- a/src/yuv/mod.rs +++ b/src/yuv/mod.rs @@ -8,13 +8,17 @@ //! default). //! - [`Nv21`](crate::yuv::Nv21) — 4:2:0 semi‑planar with **VU**-ordered //! chroma (Android MediaCodec default). +//! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits +//! per sample (HDR10 / 10‑bit SDR software decode). //! //! Other families land in follow-up commits. mod nv12; mod nv21; mod yuv420p; +mod yuv420p10; pub use nv12::{Nv12, Nv12Row, Nv12Sink, nv12_to}; pub use nv21::{Nv21, Nv21Row, Nv21Sink, nv21_to}; pub use yuv420p::{Yuv420p, Yuv420pRow, Yuv420pSink, yuv420p_to}; +pub use yuv420p10::{Yuv420p10, Yuv420p10Row, Yuv420p10Sink, yuv420p10_to}; diff --git a/src/yuv/yuv420p10.rs b/src/yuv/yuv420p10.rs new file mode 100644 index 0000000..1a85e06 --- /dev/null +++ b/src/yuv/yuv420p10.rs @@ -0,0 +1,170 @@ +//! YUV 4:2:0 planar 10‑bit (`AV_PIX_FMT_YUV420P10LE`). +//! +//! Storage mirrors [`super::Yuv420p`] — three planes, Y at full size +//! plus U / V at half width and half height — but sample width is +//! 
**`u16`** (10 active bits in the low bits of each element). The +//! [`Yuv420p10Frame`] type alias pins the bit depth; the underlying +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS` so 12‑bit +//! and 14‑bit variants can be added by relaxing its validator without +//! changing kernel math. +//! +//! Ships in colconv v0.2 as the first high‑bit‑depth format (HDR / +//! 10‑bit SDR keystone). Kernel semantics match [`super::Yuv420p`]: +//! two consecutive Y rows share one chroma row (4:2:0), chroma is +//! nearest‑neighbor upsampled in registers inside the row primitive. + +use crate::{ + ColorMatrix, PixelSink, SourceFormat, + frame::{Yuv420p10Frame, Yuv420pFrame16}, + sealed::Sealed, +}; + +/// Zero‑sized marker for the YUV 4:2:0 **10‑bit** source format. Used +/// as the `F` type parameter on [`crate::sinker::MixedSinker`]. +/// +/// colconv v0.2 ships only the 10‑bit specialization; 12‑ and 14‑bit +/// will arrive as separate markers (`Yuv420p12`, `Yuv420p14`) that +/// refer to the same underlying [`Yuv420pFrame16`] struct with +/// different `BITS` values. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct Yuv420p10; + +impl Sealed for Yuv420p10 {} +impl SourceFormat for Yuv420p10 {} + +/// One output row of a 10‑bit YUV 4:2:0 source handed to a +/// [`Yuv420p10Sink`]. Structurally identical to [`super::Yuv420pRow`], +/// just `u16` samples. +#[derive(Debug, Clone, Copy)] +pub struct Yuv420p10Row<'a> { + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> Yuv420p10Row<'a> { + /// Bundles one row of a 10‑bit 4:2:0 source for a [`Yuv420p10Sink`]. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub(crate) fn new( + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + u_half, + v_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Half‑width U (Cb) row — `width / 2` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn u_half(&self) -> &'a [u16] { + self.u_half + } + + /// Half‑width V (Cr) row — `width / 2` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn v_half(&self) -> &'a [u16] { + self.v_half + } + + /// Output row index within the frame. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn row(&self) -> usize { + self.row + } + + /// YUV → RGB matrix carried through from the kernel call. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn matrix(&self) -> ColorMatrix { + self.matrix + } + + /// `true` iff Y uses the full sample range (`[0, 1023]` for 10‑bit); + /// `false` for limited range (`[64, 940]` luma, `[64, 960]` chroma). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn full_range(&self) -> bool { + self.full_range + } +} + +/// Sinks that consume 10‑bit YUV 4:2:0 rows. +pub trait Yuv420p10Sink: for<'a> PixelSink = Yuv420p10Row<'a>> {} + +/// Converts a 10‑bit YUV 4:2:0 frame by walking its rows and feeding +/// each one to the [`Yuv420p10Sink`]. See [`super::yuv420p_to`] for +/// the shared design rationale — kernel is a pure row walker, all +/// color arithmetic happens inside the Sink via the crate's row +/// primitives. 
+pub fn yuv420p10_to( + src: &Yuv420p10Frame<'_>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + // `BITS` is pinned at the const generic (10) so the walker body + // can be monomorphized per bit depth later; the row and sink types + // themselves are still 10‑bit only (`Yuv420p10Row` / `Yuv420p10Sink`). + // 12‑ and 14‑bit support will add their own marker / row / sink + // trios plus per‑depth walker entry points. + yuv420p10_walker::<10, S>(src, full_range, matrix, sink) +} + +/// Row walker for the 10‑bit YUV 4:2:0 source. `BITS` is a const +/// generic so [`Yuv420pFrame16`] geometry reads (stride, plane +/// slicing) are monomorphized; the row/sink types bound below are +/// still pinned to the 10‑bit variants — 12 / 14 will grow their own +/// walkers alongside their own marker types. +#[cfg_attr(not(tarpaulin), inline(always))] +fn yuv420p10_walker( + src: &Yuv420pFrame16<'_, BITS>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + sink.begin_frame(src.width(), src.height())?; + + let w = src.width() as usize; + let h = src.height() as usize; + let y_stride = src.y_stride() as usize; + let u_stride = src.u_stride() as usize; + let v_stride = src.v_stride() as usize; + let chroma_width = w / 2; + + let y_plane = src.y(); + let u_plane = src.u(); + let v_plane = src.v(); + + for row in 0..h { + let y_start = row * y_stride; + let y = &y_plane[y_start..y_start + w]; + + let chroma_row = row / 2; + let u_start = chroma_row * u_stride; + let v_start = chroma_row * v_stride; + let u_half = &u_plane[u_start..u_start + chroma_width]; + let v_half = &v_plane[v_start..v_start + chroma_width]; + + sink.process(Yuv420p10Row::new( + y, u_half, v_half, row, matrix, full_range, + ))?; + } + Ok(()) +}