diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0120375..d0fc00b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -200,6 +200,52 @@ jobs: export CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="$SDE_PATH/sde64 -icx --" cargo test --all-features + # Run the wasm32 simd128 lib tests under wasmtime. The `cross` job + # above only builds for wasm targets; without this job the + # `wasm_simd128` backend's handcrafted swizzles / clamps / u16 + # stores were dispatchable in production (under + # `-C target-feature=+simd128`) but never runtime‑verified. This job + # runs every scalar‑equivalence test — including the new yuv420p10 + # u8 / u16 output paths and the adversarial out‑of‑range regressions + # — against an actual wasm runtime. + # + # `wasm32-wasip1` is the wasi preview‑1 target (libstd + file/env + # APIs that the test harness needs). Criterion is gated out of the + # wasm dev‑deps in Cargo.toml because rayon doesn't build for wasi. + test-wasm-simd128: + name: test-wasm-simd128 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Cache cargo build and registry + uses: actions/cache@v5 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-test-wasm-simd128-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-test-wasm-simd128- + - name: Install Rust + wasm32-wasip1 + run: | + rustup update stable --no-self-update + rustup default stable + rustup target add wasm32-wasip1 + - name: Install wasmtime + run: | + curl https://wasmtime.dev/install.sh -sSf | bash + echo "$HOME/.wasmtime/bin" >> "$GITHUB_PATH" + - name: Run lib tests under wasmtime (simd128) + env: + # `cargo test` hands the compiled `.wasm` test binary as the + # first positional arg after `--`; wasmtime's `run --` + # interprets that as the module path. We don't need filesystem + # or env access — the tests are pure compute. 
+ CARGO_TARGET_WASM32_WASIP1_RUNNER: wasmtime run -- + RUSTFLAGS: -C target-feature=+simd128 + run: cargo test --lib --target wasm32-wasip1 + sanitizer: name: sanitizer runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index b13c252..2f34a10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,10 @@ harness = false name = "nv21_to_rgb" harness = false +[[bench]] +name = "yuv_420p10_to_rgb" +harness = false + [[bench]] name = "rgb_to_hsv" harness = false @@ -43,9 +47,15 @@ thiserror = { version = "2", default-features = false } libm = { version = "0.2", optional = true } [dev-dependencies] -criterion = "0.8" tempfile = "3" +# Criterion pulls in rayon, which doesn't build for the wasm32‑wasi* +# targets we use to run the simd128 backends under wasmtime. Gate it +# to non‑wasm hosts — benches never run on wasm anyway (they need +# system threading / timing that the wasi runner doesn't expose). +[target.'cfg(not(target_family = "wasm"))'.dev-dependencies] +criterion = "0.8" + [profile.bench] opt-level = 3 debug = false diff --git a/benches/yuv_420p10_to_rgb.rs b/benches/yuv_420p10_to_rgb.rs new file mode 100644 index 0000000..3659f65 --- /dev/null +++ b/benches/yuv_420p10_to_rgb.rs @@ -0,0 +1,108 @@ +//! Per‑row YUV 4:2:0 10‑bit → packed RGB throughput baseline. +//! +//! Three variants per width: +//! - `u8_simd` / `u8_scalar` — native‑SIMD vs scalar on the u8 output +//! path (analogous to the 8‑bit bench). +//! - `u16_simd` / `u16_scalar` — same pair for the native‑depth u16 +//! output path. The u16 path writes 2× the bytes so the MB/s +//! figure is comparable only within the u16 column. 
+ +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic 10‑bit pseudo‑random +/// sequence — values occupy the low 10 bits of each `u16`, matching +/// the storage layout of `yuv420p10le`. +fn fill_pseudo_random_u16(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = ((state >> 8) & 0x3FF) as u16; + } +} + +fn bench(c: &mut Criterion) { + // 720p / 1080p / 4K — multiples of 64 so the widest backend + // (AVX‑512, 64 pixels per iteration) covers each fully without tail + // work. Avoids skewing comparisons across targets. + const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + // ---- u8 output ------------------------------------------------------ + let mut group_u8 = c.benchmark_group("yuv420p10_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p10_to_rgb_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + // ---- u16 native-depth output ---------------------------------------- + let mut group_u16 = c.benchmark_group("yuv420p10_to_rgb_u16_row"); + + for &w in WIDTHS { + 
let mut y = std::vec![0u16; w]; + let mut u = std::vec![0u16; w / 2]; + let mut v = std::vec![0u16; w / 2]; + fill_pseudo_random_u16(&mut y, 0x1111); + fill_pseudo_random_u16(&mut u, 0x2222); + fill_pseudo_random_u16(&mut v, 0x3333); + let mut rgb = std::vec![0u16; w * 3]; + + // u16 output writes 2× the bytes of u8. + group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + yuv420p10_to_rgb_u16_row( + black_box(&y), + black_box(&u), + black_box(&v), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/src/frame.rs b/src/frame.rs index b5e96f6..585d59f 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -5,7 +5,7 @@ //! validates strides vs. widths and that each plane covers its //! declared area. -use derive_more::IsVariant; +use derive_more::{Display, IsVariant}; use thiserror::Error; /// A validated YUV 4:2:0 planar frame. @@ -686,6 +686,490 @@ pub enum Nv21FrameError { }, } +/// A validated YUV 4:2:0 planar frame at bit depths > 8 (10/12/14). +/// +/// Structurally identical to [`Yuv420pFrame`] — three planes, half‑ +/// size chroma — but sample storage is **`u16`** so every pixel +/// carries up to 16 bits of payload. `BITS` is the active bit depth +/// (10, 12, or 14). Callers are **expected** to store each sample in +/// the **low** `BITS` bits of its `u16` (upper `16 - BITS` bits zero), +/// matching FFmpeg's little‑endian `yuv420p10le` / `yuv420p12le` / +/// `yuv420p14le` convention, where each plane is a byte buffer +/// reinterpretable as `u16` little‑endian. `try_new` validates plane +/// geometry / strides / lengths but does **not** inspect sample +/// values to verify this packing. 
+/// +/// This is **not** the FFmpeg `p010` layout — `p010` stores samples +/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers +/// holding a p010 buffer must shift right by `16 - BITS` before +/// construction. +/// +/// # Input sample range +/// +/// The kernels assume every input sample is in `[0, (1 << BITS) - 1]` +/// — i.e., upper `16 - BITS` bits zero. Validating this at +/// construction would require scanning every sample of every plane +/// (megabytes per frame at video rates); instead the constructor +/// validates geometry only and the contract falls on the caller. +/// Decoders and FFmpeg output satisfy this by construction. +/// +/// **Output for out‑of‑range samples is equivalent to pre‑masking +/// every sample to the low `BITS` bits.** Every kernel (scalar + all +/// 5 SIMD tiers) AND‑masks each `u16` load to `(1 << BITS) - 1` +/// before the Q15 path, so a sample like `0xFFC0` (p010 white = +/// `1023 << 6`) is treated identically to `0x03C0` on every backend +/// when `BITS == 10`. This gives deterministic, backend‑independent +/// output for mispacked input — feeding `p010` data into a +/// `yuv420p10le`‑shaped frame produces severely distorted, but stable, +/// pixel values across scalar / NEON / SSE4.1 / AVX2 / AVX‑512 / +/// wasm simd128, which is an obvious signal for downstream diffing. +/// The mask is a single AND per load and a no‑op on valid input +/// (upper bits already zero). +/// +/// Callers who want the mispacking to surface as a loud error +/// instead of silent color corruption should use +/// [`Self::try_new_checked`] — it scans every sample and returns +/// [`Yuv420pFrame16Error::SampleOutOfRange`] on the first violation. +/// +/// colconv v0.2 ships `BITS == 10` only (the use‑case keystone for +/// HDR and 10‑bit SDR). 
12 and 14 are mechanical follow‑ups that +/// just relax the constructor's `BITS` check and add tiered aliases +/// — the kernel math (Q15 coefficients + i32 intermediates) works +/// unchanged across all three, derived at compile time from `BITS`. +/// +/// 16‑bit input (which would overflow the i32 chroma sum in the +/// Q15 path) is **not** represented by this type — it needs a +/// separate kernel family with i64 intermediates or a lower Q +/// coefficient format. That lands in a later ship. +/// +/// Stride is in **samples** (`u16` elements), not bytes. Users +/// holding a byte buffer from FFmpeg should cast via +/// [`bytemuck::cast_slice`] and divide `linesize[i]` by 2 before +/// constructing. +/// +/// `width` must be even (same 4:2:0 rationale as [`Yuv420pFrame`]); +/// `height` may be odd and is handled via `height.div_ceil(2)` in +/// chroma‑row sizing. +#[derive(Debug, Clone, Copy)] +pub struct Yuv420pFrame16<'a, const BITS: u32> { + y: &'a [u16], + u: &'a [u16], + v: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + u_stride: u32, + v_stride: u32, +} + +impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { + /// Constructs a new [`Yuv420pFrame16`], validating dimensions, plane + /// lengths, and the `BITS` parameter. + /// + /// Returns [`Yuv420pFrame16Error`] if any of: + /// - `BITS` is not 10, 12, or 14 (colconv v0.2 additionally rejects + /// 12/14 at the type alias layer — see [`Yuv420p10Frame`]), + /// - `width` or `height` is zero, + /// - `width` is odd, + /// - any stride is smaller than the plane's declared pixel width, + /// - any plane is too short to cover its declared rows, or + /// - `stride * rows` overflows `usize` (32‑bit targets only). + /// + /// All strides are in **samples** (`u16` elements). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub const fn try_new( + y: &'a [u16], + u: &'a [u16], + v: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + u_stride: u32, + v_stride: u32, + ) -> Result<Self, Yuv420pFrame16Error> { + // Guard the `BITS` parameter at the top so users who accidentally + // monomorphize on e.g. `BITS == 8` (which would work numerically + // but should go through [`Yuv420pFrame`] instead) or `BITS == 16` + // (which would overflow the i32 chroma sum in the Q15 kernel) + // get a clear error rather than silently wrong output. + if BITS != 10 && BITS != 12 && BITS != 14 { + return Err(Yuv420pFrame16Error::UnsupportedBits { bits: BITS }); + } + if width == 0 || height == 0 { + return Err(Yuv420pFrame16Error::ZeroDimension { width, height }); + } + if width & 1 != 0 { + return Err(Yuv420pFrame16Error::OddWidth { width }); + } + if y_stride < width { + return Err(Yuv420pFrame16Error::YStrideTooSmall { width, y_stride }); + } + let chroma_width = width.div_ceil(2); + if u_stride < chroma_width { + return Err(Yuv420pFrame16Error::UStrideTooSmall { + chroma_width, + u_stride, + }); + } + if v_stride < chroma_width { + return Err(Yuv420pFrame16Error::VStrideTooSmall { + chroma_width, + v_stride, + }); + } + + // Plane sizes are in `u16` elements, so the overflow guard runs + // against the sample count — callers converting from byte strides + // should have already divided by 2. 
+ let y_min = match (y_stride as usize).checked_mul(height as usize) { + Some(v) => v, + None => { + return Err(Yuv420pFrame16Error::GeometryOverflow { + stride: y_stride, + rows: height, + }); + } + }; + if y.len() < y_min { + return Err(Yuv420pFrame16Error::YPlaneTooShort { + expected: y_min, + actual: y.len(), + }); + } + let chroma_height = height.div_ceil(2); + let u_min = match (u_stride as usize).checked_mul(chroma_height as usize) { + Some(v) => v, + None => { + return Err(Yuv420pFrame16Error::GeometryOverflow { + stride: u_stride, + rows: chroma_height, + }); + } + }; + if u.len() < u_min { + return Err(Yuv420pFrame16Error::UPlaneTooShort { + expected: u_min, + actual: u.len(), + }); + } + let v_min = match (v_stride as usize).checked_mul(chroma_height as usize) { + Some(v) => v, + None => { + return Err(Yuv420pFrame16Error::GeometryOverflow { + stride: v_stride, + rows: chroma_height, + }); + } + }; + if v.len() < v_min { + return Err(Yuv420pFrame16Error::VPlaneTooShort { + expected: v_min, + actual: v.len(), + }); + } + + Ok(Self { + y, + u, + v, + width, + height, + y_stride, + u_stride, + v_stride, + }) + } + + /// Constructs a new [`Yuv420pFrame16`], panicking on invalid inputs. + /// Prefer [`Self::try_new`] when inputs may be invalid at runtime. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub const fn new( + y: &'a [u16], + u: &'a [u16], + v: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + u_stride: u32, + v_stride: u32, + ) -> Self { + match Self::try_new(y, u, v, width, height, y_stride, u_stride, v_stride) { + Ok(frame) => frame, + Err(_) => panic!("invalid Yuv420pFrame16 dimensions or plane lengths"), + } + } + + /// Like [`Self::try_new`] but additionally scans every sample of + /// every plane and rejects values above `(1 << BITS) - 1`. 
Use this + /// on untrusted input (e.g., a `u16` buffer of unknown provenance + /// that might be `p010`‑packed or otherwise dirty) where accepting + /// out-of-range samples would be unacceptable because they violate + /// the expected bit-depth contract and can produce invalid results. + /// + /// Cost: one O(plane_size) linear scan per plane — a few megabytes + /// per 1080p frame at 10 bits. The default [`Self::try_new`] skips + /// this so the hot path (decoder output, already-conforming + /// buffers) stays O(1). + /// + /// Returns [`Yuv420pFrame16Error::SampleOutOfRange`] on the first + /// offending sample — the error carries the plane, element index + /// within that plane's slice, offending value, and the valid + /// maximum so the caller can pinpoint the bad sample. All of + /// [`Self::try_new`]'s geometry errors are still possible. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub fn try_new_checked( + y: &'a [u16], + u: &'a [u16], + v: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + u_stride: u32, + v_stride: u32, + ) -> Result<Self, Yuv420pFrame16Error> { + let frame = Self::try_new(y, u, v, width, height, y_stride, u_stride, v_stride)?; + let max_valid: u16 = ((1u32 << BITS) - 1) as u16; + // Scan the declared-payload region of each plane. Stride may add + // unused padding past the declared width; we don't inspect that — + // callers often pass buffers whose padding bytes are arbitrary, + // and the kernels never read them. 
+ let w = width as usize; + let h = height as usize; + let chroma_w = w / 2; + let chroma_h = height.div_ceil(2) as usize; + for row in 0..h { + let start = row * y_stride as usize; + for (col, &s) in y[start..start + w].iter().enumerate() { + if s > max_valid { + return Err(Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::Y, + index: start + col, + value: s, + max_valid, + }); + } + } + } + for row in 0..chroma_h { + let start = row * u_stride as usize; + for (col, &s) in u[start..start + chroma_w].iter().enumerate() { + if s > max_valid { + return Err(Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::U, + index: start + col, + value: s, + max_valid, + }); + } + } + } + for row in 0..chroma_h { + let start = row * v_stride as usize; + for (col, &s) in v[start..start + chroma_w].iter().enumerate() { + if s > max_valid { + return Err(Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::V, + index: start + col, + value: s, + max_valid, + }); + } + } + } + Ok(frame) + } + + /// Y (luma) plane samples. Row `r` starts at sample offset + /// `r * y_stride()`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn y(&self) -> &'a [u16] { + self.y + } + + /// U (Cb) plane samples. Row `r` starts at sample offset + /// `r * u_stride()`. U has half the width and half the height of the + /// frame (chroma row index for output row `r` is `r / 2`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn u(&self) -> &'a [u16] { + self.u + } + + /// V (Cr) plane samples. Row `r` starts at sample offset + /// `r * v_stride()`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn v(&self) -> &'a [u16] { + self.v + } + + /// Frame width in pixels. Always even. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn width(&self) -> u32 { + self.width + } + + /// Frame height in pixels. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn height(&self) -> u32 { + self.height + } + + /// Sample stride of the Y plane (`>= width`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn y_stride(&self) -> u32 { + self.y_stride + } + + /// Sample stride of the U plane (`>= width / 2`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn u_stride(&self) -> u32 { + self.u_stride + } + + /// Sample stride of the V plane (`>= width / 2`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn v_stride(&self) -> u32 { + self.v_stride + } + + /// Active bit depth — 10, 12, or 14. Mirrors the `BITS` const + /// parameter so generic code can read it without naming the type. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn bits(&self) -> u32 { + BITS + } +} + +/// Type alias for a validated YUV 4:2:0 planar frame at 10 bits per +/// sample (`AV_PIX_FMT_YUV420P10LE`). Tight wrapper over +/// [`Yuv420pFrame16`] with `BITS == 10` — use this name at call sites +/// for readability. +pub type Yuv420p10Frame<'a> = Yuv420pFrame16<'a, 10>; + +/// Errors returned by [`Yuv420pFrame16::try_new`]. Variant shape +/// mirrors [`Yuv420pFrameError`], with `UnsupportedBits` added for +/// the new `BITS` parameter and all sizes expressed in **samples** +/// (`u16` elements) instead of bytes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] +#[non_exhaustive] +pub enum Yuv420pFrame16Error { + /// `BITS` was not one of the supported depths (10, 12, 14). 8‑bit + /// frames should use [`Yuv420pFrame`]; 16‑bit needs a separate + /// kernel family (see [`Yuv420pFrame16`] docs). + #[error("unsupported BITS ({bits}) for Yuv420pFrame16; must be 10, 12, or 14")] + UnsupportedBits { + /// The unsupported value of the `BITS` const parameter. + bits: u32, + }, + /// `width` or `height` was zero. + #[error("width ({width}) or height ({height}) is zero")] + ZeroDimension { + /// The supplied width. 
+ width: u32, + /// The supplied height. + height: u32, + }, + /// `width` was odd. Same 4:2:0 rationale as + /// [`Yuv420pFrameError::OddWidth`]. + #[error("width ({width}) is odd; YUV420p / 4:2:0 requires even width")] + OddWidth { + /// The supplied width. + width: u32, + }, + /// `y_stride < width` (in samples). + #[error("y_stride ({y_stride}) is smaller than width ({width})")] + YStrideTooSmall { + /// Declared frame width in pixels. + width: u32, + /// The supplied Y‑plane stride (samples). + y_stride: u32, + }, + /// `u_stride < ceil(width / 2)` (in samples). + #[error("u_stride ({u_stride}) is smaller than chroma width ({chroma_width})")] + UStrideTooSmall { + /// Required minimum chroma‑plane stride. + chroma_width: u32, + /// The supplied U‑plane stride (samples). + u_stride: u32, + }, + /// `v_stride < ceil(width / 2)` (in samples). + #[error("v_stride ({v_stride}) is smaller than chroma width ({chroma_width})")] + VStrideTooSmall { + /// Required minimum chroma‑plane stride. + chroma_width: u32, + /// The supplied V‑plane stride (samples). + v_stride: u32, + }, + /// Y plane is shorter than `y_stride * height` samples. + #[error("Y plane has {actual} samples but at least {expected} are required")] + YPlaneTooShort { + /// Minimum samples required. + expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// U plane is shorter than `u_stride * ceil(height / 2)` samples. + #[error("U plane has {actual} samples but at least {expected} are required")] + UPlaneTooShort { + /// Minimum samples required. + expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// V plane is shorter than `v_stride * ceil(height / 2)` samples. + #[error("V plane has {actual} samples but at least {expected} are required")] + VPlaneTooShort { + /// Minimum samples required. + expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// `stride * rows` overflows `usize` (32‑bit targets only). 
+ #[error("declared geometry overflows usize: stride={stride} * rows={rows}")] + GeometryOverflow { + /// Stride of the plane whose size overflowed. + stride: u32, + /// Row count that overflowed against the stride. + rows: u32, + }, + /// A plane sample exceeds `(1 << BITS) - 1` — i.e., a bit above the + /// declared active depth is set. Only [`Yuv420pFrame16::try_new_checked`] + /// can produce this error; [`Yuv420pFrame16::try_new`] validates + /// geometry only and treats the low‑bit‑packing contract as an + /// expectation. Use the checked constructor for untrusted input + /// (e.g., a buffer that might be `p010`‑packed instead of + /// `yuv420p10le`‑packed). + #[error( + "sample {value} on plane {plane} at element {index} exceeds {max_valid} ((1 << BITS) - 1)" + )] + SampleOutOfRange { + /// Which plane the offending sample lives on. + plane: Yuv420pFrame16Plane, + /// Element index within that plane's slice. This is the raw + /// `&[u16]` index — it accounts for stride padding rows, so + /// `index / stride` is the row, `index % stride` is the + /// in‑row position. + index: usize, + /// The offending sample value. + value: u16, + /// The maximum allowed value for this `BITS` (`(1 << BITS) - 1`). + max_valid: u16, + }, +} + +/// Identifies which plane of a [`Yuv420pFrame16`] an error refers to. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)] +pub enum Yuv420pFrame16Plane { + /// Luma plane. + Y, + /// U (Cb) chroma plane. + U, + /// V (Cr) chroma plane. + V, +} + /// Errors returned by [`Yuv420pFrame::try_new`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] #[non_exhaustive] @@ -1057,4 +1541,227 @@ mod tests { let e = Nv21Frame::try_new(&y, &vu, big, big, big, big).unwrap_err(); assert!(matches!(e, Nv21FrameError::GeometryOverflow { .. })); } + + // ---- Yuv420pFrame16 / Yuv420p10Frame ---------------------------------- + // + // Storage is `&[u16]` with sample-indexed strides. 
Validation mirrors + // the 8-bit [`Yuv420pFrame`] with the addition of the `BITS` guard. + + fn p10_planes() -> (std::vec::Vec<u16>, std::vec::Vec<u16>, std::vec::Vec<u16>) { + // 16×8 frame, chroma 8×4. Y plane solid black (Y=0); UV planes + // neutral (UV=512 = 10‑bit chroma center). Exact sample values + // don't matter for the constructor tests that use this helper — + // they only look at shape, geometry errors, and the reported + // bits. + ( + std::vec![0u16; 16 * 8], + std::vec![512u16; 8 * 4], + std::vec![512u16; 8 * 4], + ) + } + + #[test] + fn yuv420p10_try_new_accepts_valid_tight() { + let (y, u, v) = p10_planes(); + let f = Yuv420p10Frame::try_new(&y, &u, &v, 16, 8, 16, 8, 8).expect("valid"); + assert_eq!(f.width(), 16); + assert_eq!(f.height(), 8); + assert_eq!(f.bits(), 10); + } + + #[test] + fn yuv420p10_try_new_accepts_odd_height() { + // 16x9 → chroma_height = 5. Y plane 16*9 = 144 samples, U/V 8*5 = 40. + let y = std::vec![0u16; 16 * 9]; + let u = std::vec![512u16; 8 * 5]; + let v = std::vec![512u16; 8 * 5]; + let f = Yuv420p10Frame::try_new(&y, &u, &v, 16, 9, 16, 8, 8).expect("odd height valid"); + assert_eq!(f.height(), 9); + } + + #[test] + fn yuv420p10_try_new_rejects_odd_width() { + let (y, u, v) = p10_planes(); + let e = Yuv420p10Frame::try_new(&y, &u, &v, 15, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::OddWidth { width: 15 })); + } + + #[test] + fn yuv420p10_try_new_rejects_zero_dim() { + let (y, u, v) = p10_planes(); + let e = Yuv420p10Frame::try_new(&y, &u, &v, 0, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::ZeroDimension { .. })); + } + + #[test] + fn yuv420p10_try_new_rejects_short_y_plane() { + let y = std::vec![0u16; 10]; + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::YPlaneTooShort { ..
})); + } + + #[test] + fn yuv420p10_try_new_rejects_short_u_plane() { + let y = std::vec![0u16; 16 * 8]; + let u = std::vec![512u16; 4]; + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::UPlaneTooShort { .. })); + } + + #[test] + fn yuv420p16_try_new_rejects_unsupported_bits() { + // BITS == 9 is not in {10, 12, 14}; the constructor must reject it + // before any plane math runs. This also exercises the 12/14 path + // at the validation layer even though Ship 2 only ships a 10-bit + // alias. + let y = std::vec![0u16; 16 * 8]; + let u = std::vec![128u16; 8 * 4]; + let v = std::vec![128u16; 8 * 4]; + let e = Yuv420pFrame16::<9>::try_new(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::UnsupportedBits { bits: 9 } + )); + + let e16 = Yuv420pFrame16::<16>::try_new(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e16, + Yuv420pFrame16Error::UnsupportedBits { bits: 16 } + )); + } + + #[test] + fn yuv420p16_try_new_accepts_12_and_14() { + // The constructor admits 12 and 14 — Ship 2 doesn't ship kernels + // for them but the geometry validator shouldn't block the types. 
+ let y = std::vec![0u16; 16 * 8]; + let u = std::vec![2048u16; 8 * 4]; + let v = std::vec![2048u16; 8 * 4]; + let f12 = Yuv420pFrame16::<12>::try_new(&y, &u, &v, 16, 8, 16, 8, 8).expect("12-bit valid"); + assert_eq!(f12.bits(), 12); + let f14 = Yuv420pFrame16::<14>::try_new(&y, &u, &v, 16, 8, 16, 8, 8).expect("14-bit valid"); + assert_eq!(f14.bits(), 14); + } + + #[test] + #[should_panic(expected = "invalid Yuv420pFrame16")] + fn yuv420p10_new_panics_on_invalid() { + let y = std::vec![0u16; 10]; + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + let _ = Yuv420p10Frame::new(&y, &u, &v, 16, 8, 16, 8, 8); + } + + #[cfg(target_pointer_width = "32")] + #[test] + fn yuv420p10_try_new_rejects_geometry_overflow() { + // Sample count overflow on 32-bit. Same rationale as the 8-bit + // version — strides are in `u16` elements here, so the same + // `0x1_0000 * 0x1_0000` product overflows `usize`. + let big: u32 = 0x1_0000; + let y: [u16; 0] = []; + let u: [u16; 0] = []; + let v: [u16; 0] = []; + let e = Yuv420p10Frame::try_new(&y, &u, &v, big, big, big, big / 2, big / 2).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::GeometryOverflow { .. })); + } + + #[test] + fn yuv420p10_try_new_checked_accepts_in_range_samples() { + // Same valid frame as `yuv420p10_try_new_accepts_valid_tight`, + // but run through the checked constructor. All samples live in + // the 10‑bit range. + let (y, u, v) = p10_planes(); + let f = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).expect("valid"); + assert_eq!(f.width(), 16); + assert_eq!(f.bits(), 10); + } + + #[test] + fn yuv420p10_try_new_checked_rejects_y_high_bit_set() { + // A Y sample with bit 15 set — typical of `p010` packing where + // the 10 active bits sit in the high bits. `try_new` would + // accept this and let the SIMD kernels produce arch‑dependent + // garbage; `try_new_checked` catches it up front. 
+ let mut y = std::vec![0u16; 16 * 8]; + y[3 * 16 + 5] = 0x8000; // bit 15 set → way above 1023 + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + match e { + Yuv420pFrame16Error::SampleOutOfRange { + plane, + value, + max_valid, + .. + } => { + assert_eq!(plane, Yuv420pFrame16Plane::Y); + assert_eq!(value, 0x8000); + assert_eq!(max_valid, 1023); + } + other => panic!("expected SampleOutOfRange, got {other:?}"), + } + } + + #[test] + fn yuv420p10_try_new_checked_rejects_u_plane_sample() { + // Offending sample in the U plane — error must name U, not Y or V. + let y = std::vec![0u16; 16 * 8]; + let mut u = std::vec![512u16; 8 * 4]; + u[2 * 8 + 3] = 1024; // just above max + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::U, + value: 1024, + max_valid: 1023, + .. + } + )); + } + + #[test] + fn yuv420p10_try_new_checked_rejects_v_plane_sample() { + let y = std::vec![0u16; 16 * 8]; + let u = std::vec![512u16; 8 * 4]; + let mut v = std::vec![512u16; 8 * 4]; + v[1 * 8 + 7] = 0xFFFF; // all bits set + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::V, + max_valid: 1023, + .. + } + )); + } + + #[test] + fn yuv420p10_try_new_checked_accepts_exact_max_sample() { + // Boundary: sample value == (1 << BITS) - 1 is valid. 
+ let mut y = std::vec![0u16; 16 * 8]; + y[0] = 1023; + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).expect("1023 is in range"); + } + + #[test] + fn yuv420p10_try_new_checked_reports_geometry_errors_first() { + // If geometry is invalid, we never get to the sample scan — the + // same errors as `try_new` surface first. Prevents the checked + // path from doing unnecessary O(N) work on inputs that would + // fail for a simpler reason. + let y = std::vec![0u16; 10]; // Too small. + let u = std::vec![512u16; 8 * 4]; + let v = std::vec![512u16; 8 * 4]; + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!(e, Yuv420pFrame16Error::YPlaneTooShort { .. })); + } } diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index c8fb3e9..5cefb9e 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -33,13 +33,14 @@ //! 8. Saturate‑narrow to u8x16 and interleave with `vst3q_u8`. 
use core::arch::aarch64::{ - float32x4_t, int16x8_t, int32x4_t, uint8x16_t, uint8x16x3_t, vaddq_f32, vaddq_s32, vbslq_f32, - vceqq_f32, vcltq_f32, vcombine_s16, vcombine_u8, vcombine_u16, vcvtq_f32_u32, vcvtq_u32_f32, - vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vget_high_s16, vget_high_u8, vget_high_u16, - vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, vld1q_u8, vld2_u8, vld3q_u8, vmaxq_f32, - vminq_f32, vmovl_s16, vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, - vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vshrq_n_s32, vst1q_u8, vst3q_u8, - vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, + float32x4_t, int16x8_t, int32x4_t, uint8x16_t, uint8x16x3_t, uint16x8_t, uint16x8x3_t, vaddq_f32, + vaddq_s32, vandq_u16, vbslq_f32, vceqq_f32, vcltq_f32, vcombine_s16, vcombine_u8, vcombine_u16, + vcvtq_f32_u32, vcvtq_u32_f32, vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_u16, + vget_high_s16, vget_high_u8, vget_high_u16, vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, + vld1q_u8, vld1q_u16, vld2_u8, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, vmovl_s16, + vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, vqaddq_s16, + vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, vst1q_u8, + vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, }; use crate::{ColorMatrix, row::scalar}; @@ -189,6 +190,304 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// NEON YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size is 16 Y pixels / 8 chroma pairs per iteration. The +/// pipeline mirrors [`yuv_420_to_rgb_row`] byte‑for‑byte; the only +/// structural differences are: +/// - Loads are `vld1q_u16` (8 lanes of `u16`) instead of `vld1q_u8` +/// (16 lanes of `u8`), so each Y iteration needs two Y loads to +/// cover 16 pixels — there's no widening step because the samples +/// already live in 16‑bit lanes. 
+/// - Chroma bias is **512** (10‑bit center) rather than 128. +/// - Range‑scaling params come from [`scalar::range_params_n`] with +/// `BITS = 10, OUT_BITS = 8`, so `y_scale` / `c_scale` are ~¼ the +/// 8‑bit values (mapping 10‑bit input to 8‑bit output). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`] — every +/// Q15 multiply / shift mirrors the scalar path exactly, with the +/// same `(prod + (1 << 14)) >> 15` rounding. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: NEON availability is the caller's obligation; the + // dispatcher in `crate::row` verifies it. Pointer adds are bounded + // by the `while x + 16 <= width` loop condition and the caller‑ + // promised slice lengths checked above. 
+ unsafe { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let mask_v = vdupq_n_u16(scalar::bits_mask::<10>()); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // Two Y loads cover 16 lanes; one U load + one V load cover 8 + // chroma each. Each load is AND‑masked to the low 10 bits so + // out‑of‑range samples (e.g. `p010`‑style packing with the + // 10 active bits in the high 10 of each u16) can never push + // an intermediate past i16 range. For valid input the AND is + // a no‑op (samples already in [0, 1023]). + let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); + let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); + let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); + let v_vec = vandq_u16(vld1q_u16(v_half.as_ptr().add(x / 2)), mask_v); + + let y_lo = vreinterpretq_s16_u16(y_vec_lo); + let y_hi = vreinterpretq_s16_u16(y_vec_hi); + + // c - 512 for 10‑bit chroma, fits i16 since c ≤ 1023. + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + // Widen to i32x4 halves so the Q15 multiplies don't overflow. 
+ let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Duplicate the 8 chroma lanes into 16‑lane pairs — identical + // nearest‑neighbor upsample strategy as the 8‑bit kernel. + let r_dup_lo = vzip1q_s16(r_chroma, r_chroma); + let r_dup_hi = vzip2q_s16(r_chroma, r_chroma); + let g_dup_lo = vzip1q_s16(g_chroma, g_chroma); + let g_dup_hi = vzip2q_s16(g_chroma, g_chroma); + let b_dup_lo = vzip1q_s16(b_chroma, b_chroma); + let b_dup_hi = vzip2q_s16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_lo, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); + + // u8 output: saturate‑narrow i16 → u8 clamps to [0, 255]. + let b_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, b_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, b_dup_hi)), + ); + let g_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, g_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, g_dup_hi)), + ); + let r_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, r_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), + ); + + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 16; + } + + // Scalar tail — remaining < 16 pixels (always even per 4:2:0). 
+ if x < width { + scalar::yuv_420p_n_to_rgb_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// NEON YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB (native depth). +/// +/// Block size is 16 Y pixels / 8 chroma pairs per iteration. Shares +/// all pre‑write math with [`yuv420p10_to_rgb_row`]; the only +/// difference is the final clamp + write: +/// - Y‑path scale is calibrated for `OUT_BITS = 10` rather than 8, +/// so `y_scaled` lives in `[0, 1023]` before the chroma add. +/// - The `y_scaled + chroma` sum is clamped to `[0, 1023]` with +/// `vmaxq_s16(vminq_s16(_, 1023), 0)` — a simple saturate‑narrow +/// doesn't suffice because the sum can overshoot 1023 (up to ~2046 +/// without saturating at i16 bounds). +/// - Writes use two `vst3q_u16` calls per iteration — each handles 8 +/// pixels × 3 channels = 24 `u16` elements, so two cover 16 pixels. +/// +/// # Numerical contract +/// +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`] — every +/// Q15 multiply / shift / clamp mirrors the scalar reference. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: NEON availability is the caller's obligation; the + // dispatcher in `crate::row` verifies it. Pointer adds are bounded + // by the `while x + 16 <= width` loop condition and the caller‑ + // promised slice lengths. + unsafe { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let mask_v = vdupq_n_u16(scalar::bits_mask::<10>()); + let max_v = vdupq_n_s16(OUT_MAX_10); + let zero_v = vdupq_n_s16(0); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // AND‑mask each load to the low 10 bits so intermediates stay + // within the i16 range the Q15 narrow steps expect — see + // matching comment in [`yuv420p10_to_rgb_row`]. 
+ let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); + let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); + let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); + let v_vec = vandq_u16(vld1q_u16(v_half.as_ptr().add(x / 2)), mask_v); + + let y_lo = vreinterpretq_s16_u16(y_vec_lo); + let y_hi = vreinterpretq_s16_u16(y_vec_hi); + + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = vzip1q_s16(r_chroma, r_chroma); + let r_dup_hi = vzip2q_s16(r_chroma, r_chroma); + let g_dup_lo = vzip1q_s16(g_chroma, g_chroma); + let g_dup_hi = vzip2q_s16(g_chroma, g_chroma); + let b_dup_lo = vzip1q_s16(b_chroma, b_chroma); + let b_dup_hi = vzip2q_s16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_lo, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); + + // Native‑depth output: add Y + chroma in i16, then clamp to + // [0, 1023] explicitly. `vqaddq_s16` saturates at i16 bounds + // (irrelevant here since |sum| < 2047 always), so the subsequent + // max/min clamps to the 10‑bit range. 
+ let r_lo = clamp_u10(vqaddq_s16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10(vqaddq_s16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10(vqaddq_s16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10(vqaddq_s16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10(vqaddq_s16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10(vqaddq_s16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + // Two interleaved u16 writes — each `vst3q_u16` covers 8 pixels. + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + + x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an i16x8 vector to `[0, max]` and reinterprets to u16x8. +/// Used by the 10‑bit u16 output path to avoid `vqmovun_s16`'s u8 +/// saturation. +#[inline(always)] +fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_t { + unsafe { vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(v, zero_v), max_v)) } +} + /// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over the /// shared [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. /// @@ -1124,4 +1423,200 @@ mod tests { assert_eq!(input, back, "swap is not self-inverse"); } + + // ---- yuv420p10 scalar-equivalence ----------------------------------- + + /// Deterministic pseudo‑random `u16` samples in `[0, 1023]` — the + /// 10‑bit range. Upper 6 bits always zero, so the generator matches + /// real `yuv420p10le` bit patterns. 
+ fn p10_plane(n: usize, seed: usize) -> std::vec::Vec { + (0..n) + .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16) + .collect() + } + + fn check_p10_u8_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_neon = std::vec![0u8; width * 3]; + + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + } + + if rgb_scalar != rgb_neon { + let first_diff = rgb_scalar + .iter() + .zip(rgb_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "NEON 10→u8 diverges from scalar at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgb_scalar[first_diff], rgb_neon[first_diff] + ); + } + } + + fn check_p10_u16_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_neon = std::vec![0u16; width * 3]; + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + } + + if rgb_scalar != rgb_neon { + let first_diff = rgb_scalar + .iter() + .zip(rgb_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "NEON 10→u16 diverges from scalar at elem {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgb_scalar[first_diff], rgb_neon[first_diff] + ); + } + } + + #[test] + fn neon_p10_u8_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + 
ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_equivalence(16, m, full); + } + } + } + + #[test] + fn neon_p10_u16_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_equivalence(16, m, full); + } + } + } + + #[test] + fn neon_p10_matches_scalar_odd_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_p10_u8_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn neon_p10_matches_scalar_1920() { + check_p10_u8_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } + + /// Out‑of‑range regression: every kernel AND‑masks each `u16` load + /// to the low `BITS` bits, so **arbitrary** upper‑bit corruption + /// (not just p010 packing) produces scalar/NEON bit‑identical + /// output. This test sweeps three adversarial input shapes: + /// + /// - `p010`: 10 active bits in the high 10 of each `u16` + /// (`sample << 6`) — the canonical mispacking mistake. + /// - `ycgco_worst`: `Y=[0x8000; W]`, `U=[0; W/2]`, `V=[0x8000; W/2]` + /// — the specific Codex‑identified case that used to produce + /// `(1023, 0, 0)` on scalar vs `(0, 0, 0)` on NEON before the + /// load‑time mask was added. + /// - `random`: arbitrary upper‑bit flips with no particular pattern. + /// + /// Each variant runs through every color matrix × range × both + /// output paths (u8 + native‑depth u16) and asserts byte equality. 
+ #[test] + fn neon_p10_matches_scalar_on_out_of_range_samples() { + let width = 32; + + let p010_variant = + |i: usize, seed: u16| 0xFC00u16.wrapping_add(((i as u16).wrapping_mul(seed)) << 6); + let random_variant = |i: usize, seed: u16| { + let x = (i as u32) + .wrapping_mul(seed as u32) + .wrapping_add(0xDEAD_BEEF) as u16; + x ^ 0xA5A5 + }; + + for variant_name in ["p010", "ycgco_worst", "random"] { + let y: std::vec::Vec = match variant_name { + "ycgco_worst" => std::vec![0x8000u16; width], + "p010" => (0..width).map(|i| p010_variant(i, 37)).collect(), + _ => (0..width).map(|i| random_variant(i, 37)).collect(), + }; + let u: std::vec::Vec = match variant_name { + "ycgco_worst" => std::vec![0x0u16; width / 2], + "p010" => (0..width / 2).map(|i| p010_variant(i, 53)).collect(), + _ => (0..width / 2).map(|i| random_variant(i, 53)).collect(), + }; + let v: std::vec::Vec = match variant_name { + "ycgco_worst" => std::vec![0x8000u16; width / 2], + "p010" => (0..width / 2).map(|i| p010_variant(i, 71)).collect(), + _ => (0..width / 2).map(|i| random_variant(i, 71)).collect(), + }; + + for matrix in [ColorMatrix::Bt601, ColorMatrix::Bt709, ColorMatrix::YCgCo] { + for full_range in [true, false] { + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_neon = std::vec![0u8; width * 3]; + scalar::yuv_420p_n_to_rgb_row::<10>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + } + assert_eq!( + rgb_scalar, rgb_neon, + "scalar and NEON diverge on {variant_name} input (matrix={matrix:?}, full_range={full_range})" + ); + + let mut rgb16_scalar = std::vec![0u16; width * 3]; + let mut rgb16_neon = std::vec![0u16; width * 3]; + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y, + &u, + &v, + &mut rgb16_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); + } + assert_eq!( + 
rgb16_scalar, rgb16_neon, + "scalar and NEON diverge on {variant_name} u16 output (matrix={matrix:?}, full_range={full_range})" + ); + } + } + } + } } diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 4a092b9..8b0175c 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -37,11 +37,12 @@ use core::arch::wasm32::{ f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_eq, f32x4_lt, f32x4_max, f32x4_min, f32x4_mul, - f32x4_splat, f32x4_sub, i8x16, i8x16_shuffle, i16x8_add_sat, i16x8_narrow_i32x4, i16x8_splat, - i16x8_sub, i32x4_add, i32x4_extend_high_i16x8, i32x4_extend_low_i16x8, i32x4_mul, i32x4_shr, - i32x4_splat, i32x4_trunc_sat_f32x4, u8x16_narrow_i16x8, u8x16_swizzle, u16x8_extend_high_u8x16, - u16x8_extend_low_u8x16, u16x8_load_extend_u8x8, u32x4_extend_high_u16x8, u32x4_extend_low_u16x8, - v128, v128_bitselect, v128_load, v128_or, v128_store, + f32x4_splat, f32x4_sub, i8x16, i8x16_shuffle, i16x8_add_sat, i16x8_max, i16x8_min, + i16x8_narrow_i32x4, i16x8_splat, i16x8_sub, i32x4_add, i32x4_extend_high_i16x8, + i32x4_extend_low_i16x8, i32x4_mul, i32x4_shr, i32x4_splat, i32x4_trunc_sat_f32x4, + u8x16_narrow_i16x8, u8x16_swizzle, u16x8_extend_high_u8x16, u16x8_extend_low_u8x16, + u16x8_load_extend_u8x8, u16x8_splat, u32x4_extend_high_u16x8, u32x4_extend_low_u16x8, v128, + v128_and, v128_bitselect, v128_load, v128_or, v128_store, }; use crate::{ColorMatrix, row::scalar}; @@ -188,6 +189,317 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// WASM simd128 YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. Differences +/// from [`yuv_420_to_rgb_row`]: +/// - Y loads are two `v128_load` (each holds 8 `u16` = 16 bytes); U / V +/// each one `v128_load` (8 `u16`). +/// - No u8→u16 widening — samples already in 16‑bit lanes. +/// - Chroma bias 512 (10‑bit center). +/// - `range_params_n::<10, 8>` calibrates scales for 10→8 in one shift. 
+/// +/// Reuses [`chroma_i16x8`], [`dup_lo`], [`dup_hi`], [`scale_y`], and +/// [`write_rgb_16`]. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: simd128 compile‑time availability is the caller's + // obligation. + unsafe { + let rnd_v = i32x4_splat(RND); + let y_off_v = i16x8_splat(y_off as i16); + let y_scale_v = i32x4_splat(y_scale); + let c_scale_v = i32x4_splat(c_scale); + let bias_v = i16x8_splat(bias as i16); + let mask_v = u16x8_splat(scalar::bits_mask::<10>()); + let cru = i32x4_splat(coeffs.r_u()); + let crv = i32x4_splat(coeffs.r_v()); + let cgu = i32x4_splat(coeffs.g_u()); + let cgv = i32x4_splat(coeffs.g_v()); + let cbu = i32x4_splat(coeffs.b_u()); + let cbv = i32x4_splat(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // AND‑mask each load to the low 10 bits — see matching comment + // in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]. 
+ let y_low_i16 = v128_and(v128_load(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = v128_and(v128_load(y.as_ptr().add(x + 8).cast()), mask_v); + let u_vec = v128_and(v128_load(u_half.as_ptr().add(x / 2).cast()), mask_v); + let v_vec = v128_and(v128_load(v_half.as_ptr().add(x / 2).cast()), mask_v); + + let u_i16 = i16x8_sub(u_vec, bias_v); + let v_i16 = i16x8_sub(v_vec, bias_v); + + let u_lo_i32 = i32x4_extend_low_i16x8(u_i16); + let u_hi_i32 = i32x4_extend_high_i16x8(u_i16); + let v_lo_i32 = i32x4_extend_low_i16x8(v_i16); + let v_hi_i32 = i32x4_extend_high_i16x8(v_i16); + + let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = dup_lo(r_chroma); + let r_dup_hi = dup_hi(r_chroma); + let g_dup_lo = dup_lo(g_chroma); + let g_dup_hi = dup_hi(g_chroma); + let b_dup_lo = dup_lo(b_chroma); + let b_dup_hi = dup_hi(b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let b_lo = i16x8_add_sat(y_scaled_lo, b_dup_lo); + let b_hi = i16x8_add_sat(y_scaled_hi, b_dup_hi); + let g_lo = i16x8_add_sat(y_scaled_lo, g_dup_lo); + let g_hi = i16x8_add_sat(y_scaled_hi, g_dup_hi); + let r_lo = i16x8_add_sat(y_scaled_lo, r_dup_lo); + let r_hi = i16x8_add_sat(y_scaled_hi, r_dup_hi); + + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + + write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + 
x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// WASM simd128 YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB. +/// +/// Block 16 Y pixels. Mirrors [`yuv420p10_to_rgb_row`]'s pre‑write +/// math; output uses explicit `i16x8_min` / `i16x8_max` clamp to +/// `[0, 1023]` and two calls to [`write_rgb_u16_8`] per block. +/// +/// # Numerical contract +/// +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`]. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: simd128 compile‑time availability is the caller's + // obligation. 
+ unsafe { + let rnd_v = i32x4_splat(RND); + let y_off_v = i16x8_splat(y_off as i16); + let y_scale_v = i32x4_splat(y_scale); + let c_scale_v = i32x4_splat(c_scale); + let bias_v = i16x8_splat(bias as i16); + let mask_v = u16x8_splat(scalar::bits_mask::<10>()); + let max_v = i16x8_splat(OUT_MAX_10); + let zero_v = i16x8_splat(0); + let cru = i32x4_splat(coeffs.r_u()); + let crv = i32x4_splat(coeffs.r_v()); + let cgu = i32x4_splat(coeffs.g_u()); + let cgv = i32x4_splat(coeffs.g_v()); + let cbu = i32x4_splat(coeffs.b_u()); + let cbv = i32x4_splat(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // AND‑mask loads to the low 10 bits so `chroma_i16x8`'s + // `i16x8_narrow_i32x4` stays lossless. + let y_low_i16 = v128_and(v128_load(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = v128_and(v128_load(y.as_ptr().add(x + 8).cast()), mask_v); + let u_vec = v128_and(v128_load(u_half.as_ptr().add(x / 2).cast()), mask_v); + let v_vec = v128_and(v128_load(v_half.as_ptr().add(x / 2).cast()), mask_v); + + let u_i16 = i16x8_sub(u_vec, bias_v); + let v_i16 = i16x8_sub(v_vec, bias_v); + + let u_lo_i32 = i32x4_extend_low_i16x8(u_i16); + let u_hi_i32 = i32x4_extend_high_i16x8(u_i16); + let v_lo_i32 = i32x4_extend_low_i16x8(v_i16); + let v_hi_i32 = i32x4_extend_high_i16x8(v_i16); + + let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = dup_lo(r_chroma); + let r_dup_hi = dup_hi(r_chroma); + let g_dup_lo = dup_lo(g_chroma); + let g_dup_hi = dup_hi(g_chroma); + let 
b_dup_lo = dup_lo(b_chroma); + let b_dup_hi = dup_hi(b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let r_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, b_dup_hi), zero_v, max_v); + + let dst = rgb_out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + + x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an i16x8 vector to `[0, max]`. Used by the 10‑bit u16 +/// output path. +#[inline(always)] +fn clamp_u10_wasm(v: v128, zero_v: v128, max_v: v128) -> v128 { + i16x8_min(i16x8_max(v, zero_v), max_v) +} + +/// Writes 8 pixels of packed `u16` RGB (24 `u16` = 48 bytes) using +/// the SSSE3‑style 3‑way interleave pattern adapted to 16‑bit lanes. +/// Mirrors [`crate::row::arch::x86_common::write_rgb_u16_8`] — each +/// output u16 is two adjacent bytes sourced from one of the three +/// channel vectors via `u8x16_swizzle` with a compile‑time byte +/// mask (0xFF / negative zeros the lane, matching `_mm_shuffle_epi8` +/// semantics). +/// +/// # Safety +/// +/// `ptr` must point to at least 48 writable bytes (24 `u16`). Caller +/// must have simd128 enabled at compile time. 
+#[inline(always)] +unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { + unsafe { + // Block 0 = [R0 G0 B0 R1 G1 B1 R2 G2]. Masks identical in shape + // to x86_common::write_rgb_u16_8 — each output u16 pulls two + // adjacent bytes from one channel. + let r0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r0), u8x16_swizzle(g, g0)), + u8x16_swizzle(b, b0), + ); + + // Block 1 = [B2 R3 G3 B3 R4 G4 B4 R5]. + let r1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r1), u8x16_swizzle(g, g1)), + u8x16_swizzle(b, b1), + ); + + // Block 2 = [G5 B5 R6 G6 B6 R7 G7 B7]. + let r2 = i8x16( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g2 = i8x16( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b2 = i8x16( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r2), u8x16_swizzle(g, g2)), + u8x16_swizzle(b, b2), + ); + + v128_store(ptr.cast(), out0); + v128_store(ptr.add(8).cast(), out1); + v128_store(ptr.add(16).cast(), out2); + } +} + /// WASM simd128 NV12 → packed RGB (UV-ordered chroma). Thin wrapper /// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. /// @@ -1008,4 +1320,118 @@ mod tests { check_hsv_equivalence(&rgb[..w * 3], w); } } + + // ---- yuv420p10 scalar-equivalence ----------------------------------- + // + // These tests compile only for `target_arch = "wasm32"` (via the + // outer `target_feature = "simd128"` gate on the module). 
CI + // executes them under wasmtime in the `test-wasm-simd128` job + // (see `.github/workflows/ci.yml`): the lib is compiled for + // `wasm32-wasip1` with `-C target-feature=+simd128` and + // `CARGO_TARGET_WASM32_WASIP1_RUNNER=wasmtime run --` passes each + // compiled `.wasm` test binary to wasmtime. Every scalar‑ + // equivalence check below runs on real SIMD instructions, not + // just a compile check. + + fn p10_plane(n: usize, seed: usize) -> std::vec::Vec { + (0..n) + .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16) + .collect() + } + + fn check_p10_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + + if rgb_scalar != rgb_simd { + let first_diff = rgb_scalar + .iter() + .zip(rgb_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "simd128 10→u8 diverges at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + fn check_p10_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + + if rgb_scalar != rgb_simd { + let first_diff = rgb_scalar + .iter() + .zip(rgb_simd.iter()) + .position(|(a, b)| a != b) + 
.unwrap(); + panic!( + "simd128 10→u16 diverges at elem {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + #[test] + fn simd128_p10_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_simd128_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_p10_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_simd128_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_p10_matches_scalar_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_p10_u8_simd128_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_simd128_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn simd128_p10_matches_scalar_1920() { + check_p10_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index beb3b6f..d1a1710 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -39,18 +39,18 @@ //! element order. Every fixup is called out inline. 
use core::arch::x86_64::{ - __m256i, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, _mm256_castsi256_si128, - _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256, _mm256_loadu_si256, - _mm256_mullo_epi32, _mm256_packs_epi32, _mm256_packus_epi16, _mm256_permute2x128_si256, - _mm256_permute4x64_epi64, _mm256_set1_epi16, _mm256_set1_epi32, _mm256_setr_epi8, - _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_sub_epi16, _mm256_unpackhi_epi16, - _mm256_unpacklo_epi16, + __m256i, _mm_loadu_si128, _mm256_add_epi32, _mm256_adds_epi16, _mm256_and_si256, + _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256, + _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, _mm256_mullo_epi32, _mm256_packs_epi32, + _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set1_epi16, + _mm256_set1_epi32, _mm256_setr_epi8, _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_sub_epi16, + _mm256_unpackhi_epi16, _mm256_unpacklo_epi16, }; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16}, + arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, scalar, }, }; @@ -207,6 +207,325 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// AVX2 YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size 32 Y pixels per iteration (matching the 8‑bit AVX2 +/// kernel). Key differences: +/// - Two `_mm256_loadu_si256` loads for Y (each 16 `u16` = 32 bytes); +/// one load each for U / V (16 `u16` = 32 bytes). +/// - No u8→i16 widening — 10‑bit samples already occupy 16‑bit lanes +/// and fit i16 without overflow. +/// - Chroma bias is 512 (10‑bit center). +/// - `range_params_n::<10, 8>` calibrates scales for 10→8 in one shift. +/// +/// Reuses [`chroma_i16x16`], [`chroma_dup`], [`scale_y`], +/// [`narrow_u8x32`], and [`write_rgb_32`] from the 8‑bit path — the +/// post‑chroma math is identical across bit depths. 
+/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: AVX2 availability is the caller's obligation. + unsafe { + let rnd_v = _mm256_set1_epi32(RND); + let y_off_v = _mm256_set1_epi16(y_off as i16); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + let bias_v = _mm256_set1_epi16(bias as i16); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 32 <= width { + // 32 Y = two `_mm256_loadu_si256` (16 u16 each). U/V each = one + // load of 16 u16. AND‑mask each load to the low 10 bits — see + // matching comment in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]. 
+ let y_low_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), mask_v); + let u_vec = _mm256_and_si256( + _mm256_loadu_si256(u_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + let v_vec = _mm256_and_si256( + _mm256_loadu_si256(v_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + + let u_i16 = _mm256_sub_epi16(u_vec, bias_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16)); + let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let b_lo = _mm256_adds_epi16(y_scaled_lo, b_dup_lo); + let b_hi = _mm256_adds_epi16(y_scaled_hi, b_dup_hi); + let g_lo = _mm256_adds_epi16(y_scaled_lo, g_dup_lo); + let g_hi = _mm256_adds_epi16(y_scaled_hi, g_dup_hi); + let r_lo = _mm256_adds_epi16(y_scaled_lo, 
r_dup_lo); + let r_hi = _mm256_adds_epi16(y_scaled_hi, r_dup_hi); + + let b_u8 = narrow_u8x32(b_lo, b_hi); + let g_u8 = narrow_u8x32(g_lo, g_hi); + let r_u8 = narrow_u8x32(r_lo, r_hi); + + write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + x += 32; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// AVX2 YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB. +/// +/// Block size 32 Y pixels. Mirrors [`yuv420p10_to_rgb_row`]'s +/// pre‑write math; output uses explicit min/max clamp to `[0, 1023]` +/// (`_mm256_packus_epi16` would clip to u8). Writes are issued via +/// four `write_rgb_u16_8` calls per 32‑pixel block — each extracts a +/// 128‑bit half of the AVX2 `i16x16` channel vectors and hands them +/// to the shared SSE4.1 u16 interleave helper. A 256‑bit AVX2 u16 +/// interleave would cut store count in half; left as a follow‑up +/// optimization, since the u16 path is fidelity‑driven rather than +/// throughput‑critical. +/// +/// # Numerical contract +/// +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`]. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: AVX2 availability is the caller's obligation. + unsafe { + let rnd_v = _mm256_set1_epi32(RND); + let y_off_v = _mm256_set1_epi16(y_off as i16); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + let bias_v = _mm256_set1_epi16(bias as i16); + let mask_v = _mm256_set1_epi16(scalar::bits_mask::<10>() as i16); + let max_v = _mm256_set1_epi16(OUT_MAX_10); + let zero_v = _mm256_set1_epi16(0); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 32 <= width { + // AND‑mask loads to the low 10 bits so `chroma_i16x16`'s + // `_mm256_packs_epi32` narrow stays lossless. 
+ let y_low_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), mask_v); + let u_vec = _mm256_and_si256( + _mm256_loadu_si256(u_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + let v_vec = _mm256_and_si256( + _mm256_loadu_si256(v_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + + let u_i16 = _mm256_sub_epi16(u_vec, bias_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16)); + let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + // Per‑channel saturating add + explicit clamp to [0, 1023]. 
+ let r_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + // Four 8‑pixel u16 writes per 32‑pixel block. Each extracts a + // 128‑bit half of an i16x16 channel and hands it to the shared + // SSE4.1 u16 interleave helper. + let dst = rgb_out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + dst.add(72), + ); + + x += 32; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an `i16x16` vector to `[0, max]` via AVX2 `_mm256_min_epi16` +/// / `_mm256_max_epi16`. Used by the 10‑bit u16 output path where +/// `_mm256_packus_epi16` would incorrectly clip to u8. +#[inline(always)] +fn clamp_u10_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { + unsafe { _mm256_min_epi16(_mm256_max_epi16(v, zero_v), max_v) } +} + /// AVX2 NV12 → packed RGB (UV-ordered chroma). 
Thin wrapper over /// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. /// @@ -946,4 +1265,114 @@ mod tests { check_hsv_equivalence(&rgb[..w * 3], w); } } + + // ---- yuv420p10 AVX2 scalar-equivalence ------------------------------ + + fn p10_plane(n: usize, seed: usize) -> std::vec::Vec { + (0..n) + .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16) + .collect() + } + + fn check_p10_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + + scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + + if rgb_scalar != rgb_simd { + let first_diff = rgb_scalar + .iter() + .zip(rgb_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "AVX2 10→u8 diverges at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + fn check_p10_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let y = p10_plane(width, 37); + let u = p10_plane(width / 2, 53); + let v = p10_plane(width / 2, 71); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + unsafe { + yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + } + + if rgb_scalar != rgb_simd { + let first_diff = rgb_scalar + .iter() + .zip(rgb_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + panic!( + "AVX2 10→u16 diverges at elem 
{first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + #[test] + fn avx2_p10_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_avx2_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_p10_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_avx2_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_p10_matches_scalar_odd_tail_widths() { + for w in [34usize, 62, 66, 1922] { + check_p10_u8_avx2_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_avx2_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn avx2_p10_matches_scalar_1920() { + check_p10_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index f0f1f93..0c8f941 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -54,8 +54,9 @@ use core::arch::x86_64::{ __m128i, __m512i, _mm_setr_epi8, _mm256_loadu_si256, _mm512_add_epi32, _mm512_adds_epi16, - _mm512_broadcast_i32x4, _mm512_castsi512_si128, _mm512_castsi512_si256, _mm512_cvtepi16_epi32, - _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, _mm512_extracti64x4_epi64, _mm512_loadu_si512, + _mm512_and_si512, _mm512_broadcast_i32x4, _mm512_castsi512_si128, _mm512_castsi512_si256, + _mm512_cvtepi16_epi32, _mm512_cvtepu8_epi16, _mm512_extracti32x4_epi32, + _mm512_extracti64x4_epi64, _mm512_loadu_si512, _mm512_max_epi16, _mm512_min_epi16, _mm512_mullo_epi32, _mm512_packs_epi32, _mm512_packus_epi16, 
_mm512_permutex2var_epi64, _mm512_permutexvar_epi64, _mm512_set1_epi16, _mm512_set1_epi32, _mm512_setr_epi64, _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_sub_epi16, _mm512_unpackhi_epi16, @@ -65,7 +66,7 @@ use core::arch::x86_64::{ use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16}, + arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, scalar, }, }; @@ -223,6 +224,354 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// AVX‑512 YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size 64 Y pixels / 32 chroma pairs per iteration (matching +/// the 8‑bit AVX‑512 kernel). Structural differences: +/// - Two `_mm512_loadu_si512` loads for Y (each 32 `u16` = 64 bytes); +/// one `_mm512_loadu_si512` each for U / V (32 `u16`). +/// - No u8→i16 widening — 10‑bit samples already occupy 16‑bit lanes. +/// - Chroma bias is 512 (10‑bit center). +/// - `range_params_n::<10, 8>` calibrates scales for 10→8 in one shift. +/// +/// Reuses [`chroma_i16x32`], [`chroma_dup`], [`scale_y`], +/// [`narrow_u8x64`], and [`write_rgb_64`] along with the pack / dup +/// lane‑fixup indices from the 8‑bit path — post‑chroma math is +/// identical across bit depths. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. +/// +/// # Safety +/// +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn yuv420p10_to_rgb_row(
    y: &[u16],
    u_half: &[u16],
    v_half: &[u16],
    rgb_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(u_half.len() >= width / 2);
    debug_assert!(v_half.len() >= width / 2);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10‑bit input → 8‑bit output range calibration in a single shift.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
    let bias = scalar::chroma_bias::<10>();
    const RND: i32 = 1 << 14;

    // SAFETY: AVX‑512BW availability is the caller's obligation.
    unsafe {
        let rnd_v = _mm512_set1_epi32(RND);
        let y_off_v = _mm512_set1_epi16(y_off as i16);
        let y_scale_v = _mm512_set1_epi32(y_scale);
        let c_scale_v = _mm512_set1_epi32(c_scale);
        let bias_v = _mm512_set1_epi16(bias as i16);
        let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16);
        let cru = _mm512_set1_epi32(coeffs.r_u());
        let crv = _mm512_set1_epi32(coeffs.r_v());
        let cgu = _mm512_set1_epi32(coeffs.g_u());
        let cgv = _mm512_set1_epi32(coeffs.g_v());
        let cbu = _mm512_set1_epi32(coeffs.b_u());
        let cbv = _mm512_set1_epi32(coeffs.b_v());

        // Cross‑lane fixup / chroma‑duplication qword index vectors —
        // same shapes as the 8‑bit AVX‑512 kernel uses.
        let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
        let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
        let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);

        let mut x = 0usize;
        while x + 64 <= width {
            // AND‑mask every load to the low 10 bits — see matching
            // comment in [`crate::row::scalar::yuv_420p_n_to_rgb_row`].
            let y_low_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x).cast()), mask_v);
            let y_high_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), mask_v);
            let u_vec = _mm512_and_si512(
                _mm512_loadu_si512(u_half.as_ptr().add(x / 2).cast()),
                mask_v,
            );
            let v_vec = _mm512_and_si512(
                _mm512_loadu_si512(v_half.as_ptr().add(x / 2).cast()),
                mask_v,
            );

            // Re-center chroma around the 10‑bit midpoint (512).
            let u_i16 = _mm512_sub_epi16(u_vec, bias_v);
            let v_i16 = _mm512_sub_epi16(v_vec, bias_v);

            let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16));
            let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16));
            let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16));
            let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16));

            let u_d_lo = q15_shift(_mm512_add_epi32(
                _mm512_mullo_epi32(u_lo_i32, c_scale_v),
                rnd_v,
            ));
            let u_d_hi = q15_shift(_mm512_add_epi32(
                _mm512_mullo_epi32(u_hi_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_lo = q15_shift(_mm512_add_epi32(
                _mm512_mullo_epi32(v_lo_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_hi = q15_shift(_mm512_add_epi32(
                _mm512_mullo_epi32(v_hi_i32, c_scale_v),
                rnd_v,
            ));

            let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
            let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
            let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);

            let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx);
            let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx);
            let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);

            let b_lo = _mm512_adds_epi16(y_scaled_lo, b_dup_lo);
            let b_hi = _mm512_adds_epi16(y_scaled_hi, b_dup_hi);
            let g_lo = _mm512_adds_epi16(y_scaled_lo, g_dup_lo);
            let g_hi = _mm512_adds_epi16(y_scaled_hi, g_dup_hi);
            let r_lo = _mm512_adds_epi16(y_scaled_lo, r_dup_lo);
            let r_hi = _mm512_adds_epi16(y_scaled_hi, r_dup_hi);

            let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup);
            let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup);
            let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup);

            write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));

            x += 64;
        }

        // Residual (< 64 px) pixels fall back to the scalar reference.
        if x < width {
            scalar::yuv_420p_n_to_rgb_row::<10>(
                &y[x..width],
                &u_half[x / 2..width / 2],
                &v_half[x / 2..width / 2],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// AVX‑512 YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB.
///
/// Block size 64 Y pixels per iteration. Mirrors
/// [`yuv420p10_to_rgb_row`]'s pre‑write math; output uses explicit
/// min/max clamp to `[0, 1023]` and 8 calls to [`write_rgb_u16_8`]
/// per block (each handles 8 pixels). A true AVX‑512 u16 interleave
/// would cut store count ~4× (24 × 16‑byte stores down to
/// 6 × 64‑byte stores); left as a follow‑up optimization.
///
/// # Numerical contract
///
/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`].
///
/// # Safety
///
/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.**
/// 2. `width & 1 == 0`.
/// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
///    `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: AVX‑512BW availability is the caller's obligation. + unsafe { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + let mask_v = _mm512_set1_epi16(scalar::bits_mask::<10>() as i16); + let max_v = _mm512_set1_epi16(OUT_MAX_10); + let zero_v = _mm512_set1_epi16(0); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + let mut x = 0usize; + while x + 64 <= width { + // AND‑mask loads to the low 10 bits so `chroma_i16x32`'s + // `_mm512_packs_epi32` narrow stays lossless. 
+ let y_low_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), mask_v); + let u_vec = _mm512_and_si512( + _mm512_loadu_si512(u_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + let v_vec = _mm512_and_si512( + _mm512_loadu_si512(v_half.as_ptr().add(x / 2).cast()), + mask_v, + ); + + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + let r_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = 
clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + // Eight 8‑pixel u16 writes per 64‑pixel block. For each i16x32 + // channel vector we extract four 128‑bit quarters and hand each + // to the shared SSE4.1 u16 interleave helper. + let dst = rgb_out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + + x += 64; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an `i16x32` vector to `[0, max]` via AVX‑512 +/// `_mm512_min_epi16` / `_mm512_max_epi16`. Used by the 10‑bit u16 +/// output path. +#[inline(always)] +fn clamp_u10_x32(v: __m512i, zero_v: __m512i, max_v: __m512i) -> __m512i { + unsafe { _mm512_min_epi16(_mm512_max_epi16(v, zero_v), max_v) } +} + +/// Writes one 8‑pixel u16 RGB chunk using a 128‑bit quarter of each +/// `i16x32` channel vector. `idx` ∈ `{0,1,2,3}` selects which of the +/// four 128‑bit lanes to extract via `_mm512_extracti32x4_epi32`. +/// +/// # Safety +/// +/// Same as [`write_rgb_u16_8`] — `ptr` must point to at least 48 +/// writable bytes (24 `u16`). 
Caller's `target_feature` must include
/// AVX‑512F + AVX‑512BW (so `_mm512_extracti32x4_epi32` is available)
/// and SSSE3 (for the underlying `_mm_shuffle_epi8` inside
/// `write_rgb_u16_8`).
#[inline(always)]
unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u16) {
    // SAFETY: caller holds the AVX‑512F + AVX‑512BW + SSSE3
    // target‑feature context. `_mm512_extracti32x4_epi32` takes its
    // lane index as a const generic, so the runtime `idx` is dispatched
    // by `match` to one of four constant instantiations; call sites
    // only pass 0..=3, and the `_` arm maps anything else to lane 3.
    unsafe {
        let (rq, gq, bq) = match idx {
            0 => (
                _mm512_extracti32x4_epi32::<0>(r),
                _mm512_extracti32x4_epi32::<0>(g),
                _mm512_extracti32x4_epi32::<0>(b),
            ),
            1 => (
                _mm512_extracti32x4_epi32::<1>(r),
                _mm512_extracti32x4_epi32::<1>(g),
                _mm512_extracti32x4_epi32::<1>(b),
            ),
            2 => (
                _mm512_extracti32x4_epi32::<2>(r),
                _mm512_extracti32x4_epi32::<2>(g),
                _mm512_extracti32x4_epi32::<2>(b),
            ),
            _ => (
                _mm512_extracti32x4_epi32::<3>(r),
                _mm512_extracti32x4_epi32::<3>(g),
                _mm512_extracti32x4_epi32::<3>(b),
            ),
        };
        write_rgb_u16_8(rq, gq, bq, ptr);
    }
}

/// AVX‑512 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over
/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`.
///
@@ -966,4 +1315,114 @@ mod tests {
         check_hsv_equivalence(&rgb[..w * 3], w);
     }
 }
+
+    // ---- yuv420p10 AVX-512 scalar-equivalence ---------------------------
+
+    fn p10_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16)
+            .collect()
+    }
+
+    fn check_p10_u8_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        if !std::arch::is_x86_feature_detected!("avx512bw") {
+            return;
+        }
+        let y = p10_plane(width, 37);
+        let u = p10_plane(width / 2, 53);
+        let v = p10_plane(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+
+        if rgb_scalar != rgb_simd {
+            let first_diff = rgb_scalar
+                .iter()
+                .zip(rgb_simd.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            panic!(
+                "AVX-512 10→u8 diverges at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}",
+                rgb_scalar[first_diff], rgb_simd[first_diff]
+            );
+        }
+    }
+
+    fn check_p10_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        if !std::arch::is_x86_feature_detected!("avx512bw") {
+            return;
+        }
+        let y = p10_plane(width, 37);
+        let u = p10_plane(width / 2, 53);
+        let v = p10_plane(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+
+        if rgb_scalar != rgb_simd {
+            let first_diff = rgb_scalar
+                .iter()
+                .zip(rgb_simd.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            panic!(
+                "AVX-512 10→u16 diverges at elem {first_diff} (width={width}, matrix={matrix:?}, 
full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + #[test] + fn avx512_p10_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_avx512_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_p10_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_avx512_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_p10_matches_scalar_odd_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_p10_u8_avx512_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_avx512_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn avx512_p10_matches_scalar_1920() { + check_p10_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/arch/x86_common.rs b/src/row/arch/x86_common.rs index b78827d..14dc783 100644 --- a/src/row/arch/x86_common.rs +++ b/src/row/arch/x86_common.rs @@ -84,6 +84,74 @@ pub(super) unsafe fn write_rgb_16(r: __m128i, g: __m128i, b: __m128i, ptr: *mut } } +/// Writes 8 pixels of packed **`u16`** RGB (48 bytes = 24 `u16`) +/// from three `u16x8` channel vectors. Drives the SSE4.1 / AVX2 / +/// AVX‑512 high‑bit‑depth kernels' u16 output path. 
+/// +/// Three output blocks of 16 bytes (8 `u16`) each hold: +/// - Block 0: `R0, G0, B0, R1, G1, B1, R2, G2` (u16 indices 0..7) +/// - Block 1: `B2, R3, G3, B3, R4, G4, B4, R5` +/// - Block 2: `G5, B5, R6, G6, B6, R7, G7, B7` +/// +/// Each block is the OR of three `_mm_shuffle_epi8` gathers — one +/// from each of R, G, B — with the byte mask picking a pair of +/// adjacent bytes (lo, hi) for every `u16` sourced from that +/// channel. 0x80 (`-1` as i8) zeros the lane, to be OR'd in by +/// another channel's contribution. +/// +/// # Safety +/// +/// - `ptr` must point to at least 48 writable bytes (aligned or +/// unaligned — we use `storeu`). +/// - The calling function must have SSSE3 available (via SSE4.1 or +/// a superset like AVX2 / AVX‑512BW). +#[inline(always)] +pub(super) unsafe fn write_rgb_u16_8(r: __m128i, g: __m128i, b: __m128i, ptr: *mut u16) { + unsafe { + // Block 0 = [R0 G0 B0 R1 G1 B1 R2 G2] — 8 `u16` = 16 bytes. + // R contributes pairs (0,1), (6,7), (12,13); G pairs (2,3), (8,9), + // (14,15); B pairs (4,5), (10,11). + let r0 = _mm_setr_epi8(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g0 = _mm_setr_epi8(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b0 = _mm_setr_epi8(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r0), _mm_shuffle_epi8(g, g0)), + _mm_shuffle_epi8(b, b0), + ); + + // Block 1 = [B2 R3 G3 B3 R4 G4 B4 R5]. R pairs (6,7), (8,9), + // (10,11); G pairs (6,7), (8,9); B pairs (4,5), (6,7), (8,9). + let r1 = _mm_setr_epi8(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g1 = _mm_setr_epi8(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b1 = _mm_setr_epi8(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r1), _mm_shuffle_epi8(g, g1)), + _mm_shuffle_epi8(b, b1), + ); + + // Block 2 = [G5 B5 R6 G6 B6 R7 G7 B7]. 
R pairs (12,13), (14,15); + // G pairs (10,11), (12,13), (14,15); B pairs (10,11), (12,13), + // (14,15). + let r2 = _mm_setr_epi8( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g2 = _mm_setr_epi8( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b2 = _mm_setr_epi8( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r2), _mm_shuffle_epi8(g, g2)), + _mm_shuffle_epi8(b, b2), + ); + + _mm_storeu_si128(ptr.cast(), out0); + _mm_storeu_si128(ptr.add(8).cast(), out1); + _mm_storeu_si128(ptr.add(16).cast(), out2); + } +} + /// Swaps the outer two channels of 16 packed 3‑byte pixels (48 bytes /// in, 48 bytes out). Drives both BGR→RGB and RGB→BGR conversions /// since the transformation is self‑inverse. diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 3ef5988..297f1fb 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -36,16 +36,16 @@ //! `super::x86_common::write_rgb_16`. 
use core::arch::x86_64::{ - __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, _mm_loadl_epi64, - _mm_loadu_si128, _mm_mullo_epi32, _mm_packs_epi32, _mm_packus_epi16, _mm_set1_epi16, - _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, _mm_srai_epi32, _mm_srli_si128, _mm_sub_epi16, - _mm_unpackhi_epi16, _mm_unpacklo_epi16, + __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_and_si128, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, + _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, _mm_mullo_epi32, _mm_packs_epi32, + _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, + _mm_srai_epi32, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, _mm_unpacklo_epi16, }; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16}, + arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, scalar, }, }; @@ -197,6 +197,272 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( /// /// # Safety /// +/// SSE4.1 YUV 4:2:0 10‑bit → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. Mirrors +/// [`yuv_420_to_rgb_row`] with three structural differences: +/// - Two `_mm_loadu_si128` loads for Y (each pulls 8 `u16` = 16 bytes); +/// U/V each load 8 `u16` via one `_mm_loadu_si128`. No u8 widening — +/// the samples already occupy 16‑bit lanes. +/// - Chroma bias is 512 (10‑bit center). +/// - `range_params_n::<10, 8>` calibrates `y_scale` / `c_scale` to +/// map 10‑bit input directly to 8‑bit output in one Q15 shift. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: SSE4.1 availability is the caller's obligation; the + // dispatcher in `crate::row` verifies it. Pointer adds are bounded + // by the `while x + 16 <= width` loop condition and the caller‑ + // promised slice lengths. + unsafe { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // 16 Y = two `u16x8` loads; 8 U + 8 V = one load each. Each + // load is AND‑masked to the low 10 bits (see matching comment + // in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]). Valid + // 10‑bit samples ≤ 1023 pass through unchanged. 
+ let y_low_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), mask_v); + let u_vec = _mm_and_si128(_mm_loadu_si128(u_half.as_ptr().add(x / 2).cast()), mask_v); + let v_vec = _mm_and_si128(_mm_loadu_si128(v_half.as_ptr().add(x / 2).cast()), mask_v); + + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = _mm_unpacklo_epi16(r_chroma, r_chroma); + let r_dup_hi = _mm_unpackhi_epi16(r_chroma, r_chroma); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma, g_chroma); + let g_dup_hi = _mm_unpackhi_epi16(g_chroma, g_chroma); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma, b_chroma); + let b_dup_hi = _mm_unpackhi_epi16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let b_lo = _mm_adds_epi16(y_scaled_lo, b_dup_lo); + let b_hi = _mm_adds_epi16(y_scaled_hi, b_dup_hi); + let g_lo = _mm_adds_epi16(y_scaled_lo, g_dup_lo); + let g_hi = _mm_adds_epi16(y_scaled_hi, g_dup_hi); + let r_lo = _mm_adds_epi16(y_scaled_lo, r_dup_lo); + let 
r_hi = _mm_adds_epi16(y_scaled_hi, r_dup_hi); + + let b_u8 = _mm_packus_epi16(b_lo, b_hi); + let g_u8 = _mm_packus_epi16(g_lo, g_hi); + let r_u8 = _mm_packus_epi16(r_lo, r_hi); + + write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// SSE4.1 YUV 4:2:0 10‑bit → packed **10‑bit `u16`** RGB. +/// +/// Block size 16 Y pixels per iteration; writes two 8‑pixel u16 RGB +/// chunks via [`write_rgb_u16_8`]. Shares all pre‑write math with the +/// u8 output path; the key differences: +/// - `range_params_n::<10, 10>` → `y_scale` / `c_scale` target the +/// 10‑bit output range (values in `[0, 1023]` at Q15 exit). +/// - Clamp is explicit min/max to `[0, 1023]` — `_mm_packus_epi16` +/// would clip to u8, so we can't reuse it here. +/// +/// # Numerical contract +/// +/// Identical to [`scalar::yuv_420p_n_to_rgb_u16_row::<10>`]. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(u_half.len() >= width / 2); + debug_assert!(v_half.len() >= width / 2); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: SSE4.1 availability is the caller's obligation. + unsafe { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let mask_v = _mm_set1_epi16(scalar::bits_mask::<10>() as i16); + let max_v = _mm_set1_epi16(OUT_MAX_10); + let zero_v = _mm_set1_epi16(0); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // AND‑mask each load to the low 10 bits — critical for the + // u16 output path since its larger `y_scale` / `c_scale` + // (32768 for 10→10 full range) would let an out‑of‑range + // sample push a `coeff * v_d` product past i16 range, + // triggering information loss in the subsequent + // `_mm_packs_epi32` narrow step inside `chroma_i16x8`. 
+ let y_low_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x).cast()), mask_v); + let y_high_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), mask_v); + let u_vec = _mm_and_si128(_mm_loadu_si128(u_half.as_ptr().add(x / 2).cast()), mask_v); + let v_vec = _mm_and_si128(_mm_loadu_si128(v_half.as_ptr().add(x / 2).cast()), mask_v); + + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = _mm_unpacklo_epi16(r_chroma, r_chroma); + let r_dup_hi = _mm_unpackhi_epi16(r_chroma, r_chroma); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma, g_chroma); + let g_dup_hi = _mm_unpackhi_epi16(g_chroma, g_chroma); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma, b_chroma); + let b_dup_hi = _mm_unpackhi_epi16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + // Per‑channel sum + clamp to [0, 1023]. 
+ let r_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + // Two 8‑pixel u16 writes cover the 16‑pixel block. + write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + + x += 16; + } + + if x < width { + scalar::yuv_420p_n_to_rgb_u16_row::<10>( + &y[x..width], + &u_half[x / 2..width / 2], + &v_half[x / 2..width / 2], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Clamps an i16x8 vector to `[0, max]` for the 10‑bit u16 output +/// path. `_mm_packus_epi16` would clip to u8, so we use explicit +/// min/max. +#[inline(always)] +fn clamp_u10(v: __m128i, zero_v: __m128i, max_v: __m128i) -> __m128i { + unsafe { _mm_min_epi16(_mm_max_epi16(v, zero_v), max_v) } +} + /// Same as [`nv12_or_nv21_to_rgb_row_impl`]. 
 #[inline]
 #[target_feature(enable = "sse4.1")]
@@ -836,4 +1102,114 @@ mod tests {
         check_hsv_equivalence(&rgb[..w * 3], w);
     }
 }
+
+    // ---- yuv420p10 scalar-equivalence -----------------------------------
+
+    fn p10_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
+        (0..n)
+            .map(|i| ((i * seed + seed * 3) & 0x3FF) as u16)
+            .collect()
+    }
+
+    fn check_p10_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = p10_plane(width, 37);
+        let u = p10_plane(width / 2, 53);
+        let v = p10_plane(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u8; width * 3];
+        let mut rgb_simd = std::vec![0u8; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv420p10_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+
+        if rgb_scalar != rgb_simd {
+            let first_diff = rgb_scalar
+                .iter()
+                .zip(rgb_simd.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            panic!(
+                "SSE4.1 10→u8 diverges at byte {first_diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}",
+                rgb_scalar[first_diff], rgb_simd[first_diff]
+            );
+        }
+    }
+
+    fn check_p10_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        if !std::arch::is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        let y = p10_plane(width, 37);
+        let u = p10_plane(width / 2, 53);
+        let v = p10_plane(width / 2, 71);
+        let mut rgb_scalar = std::vec![0u16; width * 3];
+        let mut rgb_simd = std::vec![0u16; width * 3];
+
+        scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range);
+        unsafe {
+            yuv420p10_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range);
+        }
+
+        if rgb_scalar != rgb_simd {
+            let first_diff = rgb_scalar
+                .iter()
+                .zip(rgb_simd.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            panic!(
+                "SSE4.1 10→u16 diverges at elem {first_diff} 
(width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgb_scalar[first_diff], rgb_simd[first_diff] + ); + } + } + + #[test] + fn sse41_p10_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u8_sse41_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_p10_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p10_u16_sse41_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_p10_matches_scalar_odd_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_p10_u8_sse41_equivalence(w, ColorMatrix::Bt601, false); + check_p10_u16_sse41_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn sse41_p10_matches_scalar_1920() { + check_p10_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false); + check_p10_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/mod.rs b/src/row/mod.rs index abe4782..788789b 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -314,6 +314,190 @@ pub fn nv21_to_rgb_row( scalar::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); } +/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 10 active bits in the low bits of each +/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), +/// with the conversion clamping to `[0, 255]` — the native‑depth +/// path is [`yuv420p10_to_rgb_u16_row`]. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv420p10_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv420p10_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv420p10_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv420p10_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv420p10_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed +/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, +/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless +/// downstream HDR processing when the consumer expects low‑bit‑packed +/// samples. +/// +/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` +/// elements, each in `[0, 1023]` with the upper 6 bits zero. +/// +/// This is **not** the FFmpeg `p010` layout — `p010` stores samples +/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers +/// feeding this output into a p010 consumer must shift left by 6 +/// before handing off. +/// +/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv420p10_to_rgb_u16_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. /// @@ -490,6 +674,21 @@ fn rgb_row_bytes(width: usize) -> usize { } } +/// Element count of one packed `u16`‑RGB row (`width × 3`). Identical +/// math to [`rgb_row_bytes`] — the returned value is in `u16` +/// elements, not bytes. Callers use it to size `&mut [u16]` buffers +/// for the `u16` output path. `width × 3` overflow still matters on +/// 32‑bit targets: the product names the number of elements the +/// caller allocates, and downstream SIMD kernels index with it +/// directly without re‑multiplying. 
+#[cfg_attr(not(tarpaulin), inline(always))] +fn rgb_row_elems(width: usize) -> usize { + match width.checked_mul(3) { + Some(n) => n, + None => panic!("width ({width}) × 3 overflows usize"), + } +} + // ---- runtime CPU feature detection ----------------------------------- // // Each `*_available` helper returns `true` iff the named feature is diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 34ab55e..9c6afa4 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -175,6 +175,233 @@ fn clamp_u8(v: i32) -> u8 { v.clamp(0, 255) as u8 } +// ---- High-bit-depth YUV 4:2:0 → RGB (BITS ∈ {10, 12, 14}) ------------- + +/// Converts one row of high-bit-depth 4:2:0 YUV (`u16` samples in the +/// low `BITS` bits of each element) directly to **8-bit** packed RGB. +/// +/// `BITS` is the active input bit depth (10/12/14). Chroma bias is +/// `128 << (BITS - 8)` and the Q15 coefficients plus i32 intermediates +/// work unchanged across all three depths — only the range‑scaling +/// params ([`range_params_n`]) change with `BITS`. 16‑bit input is +/// not handled here because the i32 chroma sum would overflow. +/// +/// Output semantics match [`yuv_420_to_rgb_row`]: the final clamp is +/// to `[0, 255]`, so the scale inside [`range_params_n`] targets an +/// 8‑bit output range — the kernel sheds the extra `BITS - 8` bits of +/// source precision inline rather than converting first at `BITS` and +/// then downshifting. This keeps the fast path a single Q15 shift. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p_n_to_rgb_row<const BITS: usize>(
+    y: &[u16],
+    u_half: &[u16],
+    v_half: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+    debug_assert!(y.len() >= width, "y row too short");
+    debug_assert!(u_half.len() >= width / 2, "u_half row too short");
+    debug_assert!(v_half.len() >= width / 2, "v_half row too short");
+    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+
+    let coeffs = Coefficients::for_matrix(matrix);
+    let (y_off, y_scale, c_scale) = range_params_n::<BITS, 8>(full_range);
+    let bias = chroma_bias::<BITS>();
+    let mask = bits_mask::<BITS>();
+
+    // Every sample is AND‑masked to the low `BITS` bits on load. This
+    // eliminates architecture‑dependent divergence on mispacked input
+    // (e.g. `p010`‑style buffers where the 10 active bits sit in the
+    // high bits of each `u16`): after masking, every backend sees the
+    // same in‑range sample, so the whole Q15 pipeline stays bounded
+    // (intermediate chroma sums fit i16 as designed, no saturating
+    // narrow loses information). For valid input every mask is a
+    // no‑op. For malformed input the "wrong" output is identical
+    // across scalar + all 5 SIMD backends.
+ let mut x = 0; + while x < width { + let c_idx = x / 2; + let u_d = q15_scale((u_half[c_idx] & mask) as i32 - bias, c_scale); + let v_d = q15_scale((v_half[c_idx] & mask) as i32 - bias, c_scale); + + let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); + let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); + let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); + + let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); + rgb_out[x * 3] = clamp_u8(y0 + r_chroma); + rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); + rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + + let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale); + rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); + rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); + rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + + x += 2; + } +} + +/// `(sample * scale_q15 + RND) >> 15`. With input masked to BITS, +/// the `sample * scale` product cannot overflow i32 for any +/// reasonable `OUT_BITS ≤ 16`, so plain arithmetic is sufficient. +#[cfg_attr(not(tarpaulin), inline(always))] +fn q15_scale(sample: i32, scale_q15: i32) -> i32 { + (sample * scale_q15 + (1 << 14)) >> 15 +} + +/// `(c_u * u_d + c_v * v_d + RND) >> 15`. Chroma sum max ≈ 10⁹ for +/// 14‑bit masked input, well within i32. +#[cfg_attr(not(tarpaulin), inline(always))] +fn q15_chroma(c_u: i32, u_d: i32, c_v: i32, v_d: i32) -> i32 { + (c_u * u_d + c_v * v_d + (1 << 14)) >> 15 +} + +/// Converts one row of high‑bit‑depth 4:2:0 YUV to **`u16`** packed +/// RGB at the **input's native bit depth** (`BITS`). +/// +/// Output is **low‑bit‑packed**: for 10‑bit input each `u16` holds a +/// value in `[0, 1023]` with the upper 6 bits zero — matching +/// FFmpeg's `yuv420p10le` convention. 12‑ and 14‑bit inputs produce +/// `[0, 4095]` / `[0, 16383]` respectively, again in the low bits. 
+/// +/// This is **not** the FFmpeg `p010` layout: `p010` puts samples in +/// the **high** 10 bits of each `u16` (effectively `sample << 6`). +/// Callers routing this output to a p010 consumer must shift left +/// by `16 - BITS`. +/// +/// This is the fidelity‑preserving path: no bits are shed inside the +/// conversion, so the output retains the full dynamic range of the +/// source for HDR tone mapping, 10‑bit scene analysis, and similar +/// downstream work. Callers who only need 8‑bit output should prefer +/// [`yuv_420p_n_to_rgb_row`], which is ~2× faster. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p_n_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + debug_assert!(y.len() >= width, "y row too short"); + debug_assert!(u_half.len() >= width / 2, "u_half row too short"); + debug_assert!(v_half.len() >= width / 2, "v_half row too short"); + debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + + let coeffs = Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = range_params_n::(full_range); + let bias = chroma_bias::(); + let out_max: i32 = (1i32 << BITS) - 1; + let mask = bits_mask::(); + + // Every sample AND‑masked to the low `BITS` bits — see matching + // comment in [`yuv_420p_n_to_rgb_row`]. Critical for the native‑ + // depth u16 output path: `range_params_n::<10, 10>` uses + // `y_scale = c_scale = 32768` (unit Q15 for BITS==OUT_BITS full + // range), so an unmasked out‑of‑range sample would push `u_d` / + // `v_d` to ±32256 and the subsequent `coeff * v_d` exceeds i16 + // range — breaking the SIMD kernels' `vqmovn_s32` narrow step. 
+ // Masking keeps every intermediate bounded by design. + let mut x = 0; + while x < width { + let c_idx = x / 2; + let u_d = q15_scale((u_half[c_idx] & mask) as i32 - bias, c_scale); + let v_d = q15_scale((v_half[c_idx] & mask) as i32 - bias, c_scale); + + let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); + let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); + let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); + + let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); + rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; + rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + + let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale); + rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; + rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; + rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + + x += 2; + } +} + +/// Compile‑time sample mask for `BITS`: `(1 << BITS) - 1` as `u16`. +/// Returns `0x03FF` for 10‑bit, `0x0FFF` for 12‑bit, `0x3FFF` for +/// 14‑bit. SIMD backends splat this into a vector constant and AND +/// every load against it. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) const fn bits_mask() -> u16 { + ((1u32 << BITS) - 1) as u16 +} + +/// Chroma bias for input bit depth `BITS` — `128 << (BITS - 8)`. +/// 128 for 8‑bit, 512 for 10‑bit, 2048 for 12‑bit, 8192 for 14‑bit. +/// Exposed at module visibility so SIMD backends can reuse it. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) const fn chroma_bias() -> i32 { + 128i32 << (BITS - 8) +} + +/// Range‑scaling params `(y_off, y_scale_q15, c_scale_q15)` for the +/// high‑bit‑depth kernel family. +/// +/// `BITS` is the input bit depth (10 / 12 / 14); `OUT_BITS` is the +/// target output range (8 for u8‑packed RGB, equal to `BITS` for +/// native‑depth `u16` output). 
+/// +/// The scales are chosen so that after `((sample - y_off) * scale + RND) >> 15` +/// the result lies in `[0, (1 << OUT_BITS) - 1]` without further +/// downshifting. This keeps the fast path a single Q15 multiply for +/// both output widths. +/// +/// - Full range: luma and chroma both use the same scale, mapping +/// `[0, in_max]` to `[0, out_max]`. Same shape as 8‑bit's +/// `(0, 1<<15, 1<<15)` for `BITS == OUT_BITS`. +/// - Limited range: luma maps `[16·k, 235·k]` to `[0, out_max]`, +/// chroma maps `[16·k, 240·k]` to `[0, out_max]`, where +/// `k = 1 << (BITS - 8)`. Matches FFmpeg's `AVCOL_RANGE_MPEG` +/// semantics. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) const fn range_params_n( + full_range: bool, +) -> (i32, i32, i32) { + let in_max: i64 = (1i64 << BITS) - 1; + let out_max: i64 = (1i64 << OUT_BITS) - 1; + if full_range { + // `scale = round((out_max << 15) / in_max)`. For `BITS == OUT_BITS` + // the quotient is exactly `1 << 15` (no rounding needed); for + // 10‑bit→8‑bit it's `(255 << 15) / 1023 ≈ 8167.5`, which rounds to 8168. + let scale = ((out_max << 15) + in_max / 2) / in_max; + (0, scale as i32, scale as i32) + } else { + let y_off = 16i32 << (BITS - 8); + let y_range: i64 = 219i64 << (BITS - 8); + let c_range: i64 = 224i64 << (BITS - 8); + let y_scale = ((out_max << 15) + y_range / 2) / y_range; + let c_scale = ((out_max << 15) + c_range / 2) / c_range; + (y_off, y_scale as i32, c_scale as i32) + } +} + /// Range-scaling params: `(y_off, y_scale_q15, c_scale_q15)`. /// /// Full range: no offset, unit scales (Q15 = 2^15). @@ -620,4 +847,147 @@ mod tests { rgb_to_hsv_row(&rgb, &mut h, &mut s, &mut v, 1); assert_eq!((h[0], s[0], v[0]), (120, 255, 255)); } + + // ---- yuv_420p_n_to_rgb_row (10-bit → u8) ----------------------------- + + #[test] + fn yuv420p10_rgb_black_full_range() { + // Y=0, neutral chroma (512 in 10-bit) → black. 
+ let y = [0u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); + } + + #[test] + fn yuv420p10_rgb_white_full_range() { + // 10-bit full-range white is Y=1023. + let y = [1023u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); + } + + #[test] + fn yuv420p10_rgb_gray_is_gray() { + // Mid-gray 10-bit Y=512 ↔ 8-bit 128. Within ±1 for Q15 rounding. + let y = [512u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); + assert_eq!(r, g); + assert_eq!(g, b); + assert!(r.abs_diff(128) <= 1, "got {r}"); + } + } + + #[test] + fn yuv420p10_rgb_limited_range_black_and_white() { + // 10-bit limited: Y=64 → black, Y=940 → white. + let y = [64u16, 64, 940, 940]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, false); + assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); + assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); + assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); + assert_eq!((rgb[9], rgb[10], rgb[11]), (255, 255, 255)); + } + + #[test] + fn yuv420p10_rgb_chroma_shared_across_pair() { + // Two 10-bit Y values sharing chroma: output is gray = Y>>2. + let y = [200u16, 800, 200, 800]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + // Full-range 10→8 scale = 255/1023, so Y=200 → 50, Y=800 → 199.4 → 199. + // Allow ±1 for Q15 rounding. 
+ assert!(rgb[0].abs_diff(50) <= 1, "got {}", rgb[0]); + assert!(rgb[3].abs_diff(199) <= 1, "got {}", rgb[3]); + assert!(rgb[6].abs_diff(50) <= 1, "got {}", rgb[6]); + assert!(rgb[9].abs_diff(199) <= 1, "got {}", rgb[9]); + } + + // ---- yuv_420p_n_to_rgb_u16_row (10-bit → 10-bit u16) ---------------- + + #[test] + fn yuv420p10_rgb_u16_black_full_range() { + let y = [0u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u16; 12]; + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); + } + + #[test] + fn yuv420p10_rgb_u16_white_full_range() { + // 10-bit input Y=1023, full-range scale=1 → output Y=1023 on each channel. + let y = [1023u16; 4]; + let u = [512u16; 2]; + let v = [512u16; 2]; + let mut rgb = [0u16; 12]; + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 1023), "got {rgb:?}"); + } + + #[test] + fn yuv420p10_rgb_u16_limited_range_endpoints() { + // Limited-range: Y=64 → 0, Y=940 → 1023 in 10-bit output. + let y = [64u16, 940]; + let u = [512u16; 1]; + let v = [512u16; 1]; + let mut rgb = [0u16; 6]; + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 2, ColorMatrix::Bt709, false); + assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); + assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); + } + + #[test] + fn yuv420p10_rgb_u16_preserves_full_10bit_precision() { + // Sanity: the u16 path retains native-depth precision, so two + // inputs that round to the same u8 are distinguishable in u16. + // Full-range Y=200 vs Y=201: same u8 output (50 vs 50) but + // distinct u16 outputs (200 vs 201). 
+ let y = [200u16, 201]; + let u = [512u16; 1]; + let v = [512u16; 1]; + let mut rgb8 = [0u8; 6]; + let mut rgb16 = [0u16; 6]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb8, 2, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb16, 2, ColorMatrix::Bt601, true); + assert_eq!(rgb8[0], rgb8[3]); + assert_ne!(rgb16[0], rgb16[3]); + } + + #[test] + fn yuv420p10_bt709_ycgco_differ_for_chroma() { + // Non-neutral chroma — different matrices produce different RGB. + let y = [512u16; 2]; + let u = [512u16; 1]; + let v = [800u16; 1]; + let mut bt709 = [0u8; 6]; + let mut ycgco = [0u8; 6]; + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut bt709, 2, ColorMatrix::Bt709, true); + yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut ycgco, 2, ColorMatrix::YCgCo, true); + let sad: i32 = bt709 + .iter() + .zip(ycgco.iter()) + .map(|(a, b)| (*a as i32 - *b as i32).abs()) + .sum(); + assert!( + sad > 20, + "matrices should materially differ: {bt709:?} vs {ycgco:?}" + ); + } } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 3ad5ace..432994e 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -18,8 +18,14 @@ use thiserror::Error; use crate::{ HsvBuffers, PixelSink, SourceFormat, - row::{nv12_to_rgb_row, nv21_to_rgb_row, rgb_to_hsv_row, yuv_420_to_rgb_row}, - yuv::{Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, Yuv420p, Yuv420pRow, Yuv420pSink}, + row::{ + nv12_to_rgb_row, nv21_to_rgb_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv420p10_to_rgb_row, + yuv420p10_to_rgb_u16_row, + }, + yuv::{ + Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, Yuv420p, Yuv420p10, Yuv420p10Row, + Yuv420p10Sink, Yuv420pRow, Yuv420pSink, + }, }; /// Errors returned by [`MixedSinker`] configuration and per-frame @@ -58,6 +64,19 @@ pub enum MixedSinkerError { actual: usize, }, + /// `u16` RGB buffer attached via [`MixedSinker::with_rgb_u16`] / + /// [`MixedSinker::set_rgb_u16`] is shorter than `width × height × 3` + /// `u16` elements. 
Only the high‑bit‑depth source impls + /// (currently [`Yuv420p10`](crate::yuv::Yuv420p10)) write into this + /// buffer. + #[error("MixedSinker rgb_u16 buffer too short: expected >= {expected} elements, got {actual}")] + RgbU16BufferTooShort { + /// Minimum `u16` elements required (`width × height × 3`). + expected: usize, + /// `u16` elements supplied. + actual: usize, + }, + /// Luma buffer is shorter than `width × height`. #[error("MixedSinker luma buffer too short: expected >= {expected} bytes, got {actual}")] LumaBufferTooShort { @@ -97,17 +116,24 @@ pub enum MixedSinkerError { /// direct `process` callers that bypass the walker (hand-crafted /// rows, replayed rows, etc.) before a wrong-shaped slice reaches /// an unsafe SIMD kernel. + /// + /// Lengths are expressed in **slice elements** — `u8` bytes for + /// the 8‑bit source rows (Y, U/V half, UV/VU half) and `u16` + /// elements for the 10‑bit source rows (Y10, U/V half 10). The + /// message deliberately says "elements" rather than "bytes" so the + /// same variant can serve both the `u8` and `u16` row families. #[error( - "MixedSinker row shape mismatch at row {row}: {which} slice has {actual} bytes, expected {expected}" + "MixedSinker row shape mismatch at row {row}: {which} slice has {actual} elements, expected {expected}" )] RowShapeMismatch { /// Which slice mismatched. See [`RowSlice`] for variants. which: RowSlice, /// Row index reported by the offending row. row: usize, - /// Expected slice length in bytes (given the sink's configured width). + /// Expected slice length in elements of the slice's element type + /// (`u8` for 8‑bit source rows; `u16` for 10‑bit source rows). expected: usize, - /// Actual slice length supplied by the row. + /// Actual slice length in the same unit as `expected`. actual: usize, }, @@ -181,6 +207,18 @@ pub enum RowSlice { /// pairs — byte order swapped relative to [`Self::UvHalf`]. 
#[display("VU Half")] VuHalf, + /// Full‑width Y row of a **10‑bit** planar source ([`Yuv420p10`]). + /// `u16` samples, `width` elements. + #[display("Y10")] + Y10, + /// Half‑width U row of a **10‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("U Half 10")] + UHalf10, + /// Half‑width V row of a **10‑bit** planar source. `u16` samples, + /// `width / 2` elements. + #[display("V Half 10")] + VHalf10, } /// A sink that writes any subset of `{RGB, Luma, HSV}` into @@ -206,6 +244,7 @@ pub enum RowSlice { /// [`Nv21`](crate::yuv::Nv21). pub struct MixedSinker<'a, F: SourceFormat> { rgb: Option<&'a mut [u8]>, + rgb_u16: Option<&'a mut [u16]>, luma: Option<&'a mut [u8]>, hsv: Option>, width: usize, @@ -231,6 +270,7 @@ impl MixedSinker<'_, F> { pub fn new(width: usize, height: usize) -> Self { Self { rgb: None, + rgb_u16: None, luma: None, hsv: None, width, @@ -241,12 +281,22 @@ impl MixedSinker<'_, F> { } } - /// Returns `true` iff the sinker will write RGB. + /// Returns `true` iff the sinker will write 8‑bit RGB. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn produces_rgb(&self) -> bool { self.rgb.is_some() } + /// Returns `true` iff the sinker will write `u16` RGB at the + /// source's native bit depth. Only high‑bit‑depth source impls + /// (currently [`Yuv420p10`](crate::yuv::Yuv420p10)) honor this + /// buffer — attaching it on an 8‑bit source format is legal but + /// no writes occur. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn produces_rgb_u16(&self) -> bool { + self.rgb_u16.is_some() + } + /// Returns `true` iff the sinker will write luma. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn produces_luma(&self) -> bool { @@ -338,6 +388,14 @@ impl<'a, F: SourceFormat> MixedSinker<'a, F> { Ok(self) } + // NOTE: `with_rgb_u16` / `set_rgb_u16` are **not** declared here. 
+ // They live on a format‑specific impl block further down (currently + // [`MixedSinker`]) so the buffer can only be attached to + // sink types whose `PixelSink` impl actually writes it. Attaching a + // `u16` RGB buffer to a [`Yuv420p`] / [`Nv12`] / [`Nv21`] sink is a + // compile error, not a silent stale‑state bug. Future high‑bit‑depth + // markers (12‑bit, 14‑bit, P010) will add their own impl blocks. + /// Attaches a single-plane luma output buffer. /// Returns `Err(LumaBufferTooShort)` if `buf.len() < width × height`, /// or `Err(GeometryOverflow)` on 32‑bit overflow. @@ -835,6 +893,221 @@ impl PixelSink for MixedSinker<'_, Nv21> { } } +// ---- Yuv420p10 impl ----------------------------------------------------- + +impl<'a> MixedSinker<'a, Yuv420p10> { + /// Attaches a packed **`u16`** RGB output buffer. Only available on + /// sinkers whose source format populates native‑depth `u16` RGB — + /// calling `with_rgb_u16` on an 8‑bit source sinker (e.g. + /// [`MixedSinker`]) is a compile error rather than a + /// silent no‑op that would leave the caller's buffer stale. + /// + /// Length is measured in `u16` **elements** (not bytes): minimum + /// `width × height × 3`. Each element carries a 10‑bit value in + /// the **low** 10 bits (upper 6 bits zero), matching FFmpeg's + /// `yuv420p10le` convention. This is **not** the `p010` layout + /// (which stores samples in the high 10 bits); callers feeding a + /// p010 consumer must shift the output left by 6. + /// + /// Returns `Err(RgbU16BufferTooShort)` if + /// `buf.len() < width × height × 3`, or `Err(GeometryOverflow)` + /// on 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgb_u16(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). The + /// required length is measured in `u16` **elements**, not bytes. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + // Packed RGB requires `width × height × 3` channel values — + // that's the same count whether the element type is `u8` or + // `u16`, so the [`Self::frame_bytes`] helper (named for the u8 + // RGB path's byte count) gives the element count here too. No + // size conversion needed. + let expected_elements = self.frame_bytes(3)?; + if buf.len() < expected_elements { + return Err(MixedSinkerError::RgbU16BufferTooShort { + expected: expected_elements, + actual: buf.len(), + }); + } + self.rgb_u16 = Some(buf); + Ok(self) + } +} + +impl Yuv420p10Sink for MixedSinker<'_, Yuv420p10> {} + +impl PixelSink for MixedSinker<'_, Yuv420p10> { + type Input<'r> = Yuv420p10Row<'r>; + type Error = MixedSinkerError; + + fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> { + if self.width & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: self.width }); + } + check_dimensions_match(self.width, self.height, width, height) + } + + fn process(&mut self, row: Yuv420p10Row<'_>) -> Result<(), Self::Error> { + // Bit depth is fixed by the format (10) — declared as a const so + // the downshift for u8 luma stays obvious at the call site. + const BITS: u32 = 10; + + let w = self.width; + let h = self.height; + let idx = row.row(); + let use_simd = self.simd; + + // Defense in depth — see the [`Yuv420p`] impl for the rationale. + // Row slice checks use the 10‑bit variants of [`RowSlice`] so + // downstream log output disambiguates from the 8‑bit source impls. 
+ if w & 1 != 0 { + return Err(MixedSinkerError::OddWidth { width: w }); + } + if row.y().len() != w { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::Y10, + row: idx, + expected: w, + actual: row.y().len(), + }); + } + if row.u_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::UHalf10, + row: idx, + expected: w / 2, + actual: row.u_half().len(), + }); + } + if row.v_half().len() != w / 2 { + return Err(MixedSinkerError::RowShapeMismatch { + which: RowSlice::VHalf10, + row: idx, + expected: w / 2, + actual: row.v_half().len(), + }); + } + if idx >= self.height { + return Err(MixedSinkerError::RowIndexOutOfRange { + row: idx, + configured_height: self.height, + }); + } + + let Self { + rgb, + rgb_u16, + luma, + hsv, + rgb_scratch, + .. + } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + // Luma: downshift 10‑bit Y to 8‑bit for the existing u8 luma + // buffer contract. Bit‑extension by `(BITS - 8)` preserves the + // most significant bits — functionally equivalent to FFmpeg's + // `>> (BITS - 8)` conversion used by many downstream analyses. + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> (BITS - 8)) as u8; + } + } + + // `u16` RGB output — written directly via the native‑depth row + // primitive. Computed independently of the u8 path: the two + // outputs have different scale params inside `range_params_n`, + // so they can't share an intermediate without losing precision. 
+ if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + yuv420p10_to_rgb_u16_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + // 8‑bit RGB path — either writes to the caller's buffer (when + // `with_rgb` is set) or to the lazily‑grown scratch (when HSV is + // requested without RGB). Mirrors the 8‑bit source impls' layout. + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + yuv420p10_to_rgb_row( + row.y(), + row.u_half(), + row.v_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut hsv.v[one_plane_start..one_plane_end], + w, + use_simd, + ); + } + Ok(()) + } +} + /// Returns `Ok(())` iff the walker's frame dimensions exactly match /// the sinker's configured dimensions. 
Called from /// [`PixelSink::begin_frame`] on both `MixedSinker` and @@ -872,8 +1145,8 @@ mod tests { use super::*; use crate::{ ColorMatrix, - frame::{Nv12Frame, Nv21Frame, Yuv420pFrame}, - yuv::{nv12_to, nv21_to, yuv420p_to}, + frame::{Nv12Frame, Nv21Frame, Yuv420p10Frame, Yuv420pFrame}, + yuv::{nv12_to, nv21_to, yuv420p_to, yuv420p10_to}, }; fn solid_yuv420p_frame( @@ -1739,4 +2012,162 @@ mod tests { assert_eq!(rgb_nv12, rgb_nv21); } + + // ---- Yuv420p10 -------------------------------------------------------- + + fn solid_yuv420p10_frame( + width: u32, + height: u32, + y: u16, + u: u16, + v: u16, + ) -> (Vec, Vec, Vec) { + let w = width as usize; + let h = height as usize; + let cw = w / 2; + let ch = h / 2; + ( + std::vec![y; w * h], + std::vec![u; cw * ch], + std::vec![v; cw * ch], + ) + } + + #[test] + fn yuv420p10_rgb_u8_only_gray_is_gray() { + // 10-bit mid-gray: Y=512, UV=512 → 8-bit RGB ≈ 128 on every channel. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } + } + + #[test] + fn yuv420p10_rgb_u16_only_native_depth_gray() { + // Same mid-gray frame → u16 RGB output in native 10-bit depth, so + // each channel should be ≈ 512 (the 10-bit mid). 
+ let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(512) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + // Upper 6 bits of each u16 must be zero — 10-bit convention. + assert!(px[0] <= 1023); + } + } + + #[test] + fn yuv420p10_rgb_u8_and_u16_both_populated() { + // 10-bit full-range white: Y=1023, UV=512. Both buffers should + // fill with their respective "white" values (255 for u8, 1023 for + // u16) in the same call. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 1023, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(rgb_u8.iter().all(|&c| c == 255)); + assert!(rgb_u16.iter().all(|&c| c == 1023)); + } + + #[test] + fn yuv420p10_luma_downshifts_to_8bit() { + // Y=512 at 10 bits → 512 >> 2 = 128 at 8 bits. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut luma = std::vec![0u8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_luma(&mut luma) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(luma.iter().all(|&l| l == 128)); + } + + #[test] + fn yuv420p10_hsv_from_gray_is_zero_hue_zero_sat() { + // HSV derived from the internal u8 RGB scratch: neutral gray → + // H=0, S=0, V≈128. 
Exercises the "HSV without RGB" scratch path + // on the 10-bit source. + let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut h = std::vec![0xFFu8; 16 * 8]; + let mut s = std::vec![0xFFu8; 16 * 8]; + let mut v = std::vec![0xFFu8; 16 * 8]; + let mut sink = MixedSinker::::new(16, 8) + .with_hsv(&mut h, &mut s, &mut v) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + assert!(h.iter().all(|&b| b == 0)); + assert!(s.iter().all(|&b| b == 0)); + assert!(v.iter().all(|&b| b.abs_diff(128) <= 1)); + } + + #[test] + fn yuv420p10_rgb_u16_too_short_returns_err() { + let mut rgb = std::vec![0u16; 10]; // Way too small. + let err = MixedSinker::::new(16, 8) + .with_rgb_u16(&mut rgb) + .err() + .unwrap(); + assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { .. })); + } + + #[test] + fn yuv420p10_with_simd_false_matches_with_simd_true() { + // The SIMD toggle exercises scalar-vs-SIMD dispatch. Both paths + // must produce byte-identical results on both outputs. 
+ let (yp, up, vp) = solid_yuv420p10_frame(64, 16, 600, 400, 700); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3]; + let mut s_scalar = MixedSinker::::new(64, 16) + .with_simd(false) + .with_rgb(&mut rgb_scalar) + .unwrap() + .with_rgb_u16(&mut rgb_u16_scalar) + .unwrap(); + yuv420p10_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap(); + + let mut rgb_simd = std::vec![0u8; 64 * 16 * 3]; + let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3]; + let mut s_simd = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_simd) + .unwrap() + .with_rgb_u16(&mut rgb_u16_simd) + .unwrap(); + yuv420p10_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap(); + + assert_eq!(rgb_scalar, rgb_simd); + assert_eq!(rgb_u16_scalar, rgb_u16_simd); + } } diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs index ac2647e..655b706 100644 --- a/src/yuv/mod.rs +++ b/src/yuv/mod.rs @@ -8,13 +8,17 @@ //! default). //! - [`Nv21`](crate::yuv::Nv21) — 4:2:0 semi‑planar with **VU**-ordered //! chroma (Android MediaCodec default). +//! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits +//! per sample (HDR10 / 10‑bit SDR software decode). //! //! Other families land in follow-up commits. mod nv12; mod nv21; mod yuv420p; +mod yuv420p10; pub use nv12::{Nv12, Nv12Row, Nv12Sink, nv12_to}; pub use nv21::{Nv21, Nv21Row, Nv21Sink, nv21_to}; pub use yuv420p::{Yuv420p, Yuv420pRow, Yuv420pSink, yuv420p_to}; +pub use yuv420p10::{Yuv420p10, Yuv420p10Row, Yuv420p10Sink, yuv420p10_to}; diff --git a/src/yuv/yuv420p10.rs b/src/yuv/yuv420p10.rs new file mode 100644 index 0000000..1a85e06 --- /dev/null +++ b/src/yuv/yuv420p10.rs @@ -0,0 +1,170 @@ +//! YUV 4:2:0 planar 10‑bit (`AV_PIX_FMT_YUV420P10LE`). +//! +//! Storage mirrors [`super::Yuv420p`] — three planes, Y at full size +//! plus U / V at half width and half height — but sample width is +//! 
**`u16`** (10 active bits in the low bits of each element). The +//! [`Yuv420p10Frame`] type alias pins the bit depth; the underlying +//! [`Yuv420pFrame16`] struct is const‑generic over `BITS` so 12‑bit +//! and 14‑bit variants can be added by relaxing its validator without +//! changing kernel math. +//! +//! Ships in colconv v0.2 as the first high‑bit‑depth format (HDR / +//! 10‑bit SDR keystone). Kernel semantics match [`super::Yuv420p`]: +//! two consecutive Y rows share one chroma row (4:2:0), chroma is +//! nearest‑neighbor upsampled in registers inside the row primitive. + +use crate::{ + ColorMatrix, PixelSink, SourceFormat, + frame::{Yuv420p10Frame, Yuv420pFrame16}, + sealed::Sealed, +}; + +/// Zero‑sized marker for the YUV 4:2:0 **10‑bit** source format. Used +/// as the `F` type parameter on [`crate::sinker::MixedSinker`]. +/// +/// colconv v0.2 ships only the 10‑bit specialization; 12‑ and 14‑bit +/// will arrive as separate markers (`Yuv420p12`, `Yuv420p14`) that +/// refer to the same underlying [`Yuv420pFrame16`] struct with +/// different `BITS` values. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] +pub struct Yuv420p10; + +impl Sealed for Yuv420p10 {} +impl SourceFormat for Yuv420p10 {} + +/// One output row of a 10‑bit YUV 4:2:0 source handed to a +/// [`Yuv420p10Sink`]. Structurally identical to [`super::Yuv420pRow`], +/// just `u16` samples. +#[derive(Debug, Clone, Copy)] +pub struct Yuv420p10Row<'a> { + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, +} + +impl<'a> Yuv420p10Row<'a> { + /// Bundles one row of a 10‑bit 4:2:0 source for a [`Yuv420p10Sink`]. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub(crate) fn new( + y: &'a [u16], + u_half: &'a [u16], + v_half: &'a [u16], + row: usize, + matrix: ColorMatrix, + full_range: bool, + ) -> Self { + Self { + y, + u_half, + v_half, + row, + matrix, + full_range, + } + } + + /// Full‑width Y (luma) row — `width` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn y(&self) -> &'a [u16] { + self.y + } + + /// Half‑width U (Cb) row — `width / 2` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn u_half(&self) -> &'a [u16] { + self.u_half + } + + /// Half‑width V (Cr) row — `width / 2` `u16` samples. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn v_half(&self) -> &'a [u16] { + self.v_half + } + + /// Output row index within the frame. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn row(&self) -> usize { + self.row + } + + /// YUV → RGB matrix carried through from the kernel call. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn matrix(&self) -> ColorMatrix { + self.matrix + } + + /// `true` iff Y uses the full sample range (`[0, 1023]` for 10‑bit); + /// `false` for limited range (`[64, 940]` luma, `[64, 960]` chroma). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn full_range(&self) -> bool { + self.full_range + } +} + +/// Sinks that consume 10‑bit YUV 4:2:0 rows. +pub trait Yuv420p10Sink: for<'a> PixelSink = Yuv420p10Row<'a>> {} + +/// Converts a 10‑bit YUV 4:2:0 frame by walking its rows and feeding +/// each one to the [`Yuv420p10Sink`]. See [`super::yuv420p_to`] for +/// the shared design rationale — kernel is a pure row walker, all +/// color arithmetic happens inside the Sink via the crate's row +/// primitives. 
+pub fn yuv420p10_to( + src: &Yuv420p10Frame<'_>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + // `BITS` is pinned at the const generic (10) so the walker body + // can be monomorphized per bit depth later; the row and sink types + // themselves are still 10‑bit only (`Yuv420p10Row` / `Yuv420p10Sink`). + // 12‑ and 14‑bit support will add their own marker / row / sink + // trios plus per‑depth walker entry points. + yuv420p10_walker::<10, S>(src, full_range, matrix, sink) +} + +/// Row walker for the 10‑bit YUV 4:2:0 source. `BITS` is a const +/// generic so [`Yuv420pFrame16`] geometry reads (stride, plane +/// slicing) are monomorphized; the row/sink types bound below are +/// still pinned to the 10‑bit variants — 12 / 14 will grow their own +/// walkers alongside their own marker types. +#[cfg_attr(not(tarpaulin), inline(always))] +fn yuv420p10_walker( + src: &Yuv420pFrame16<'_, BITS>, + full_range: bool, + matrix: ColorMatrix, + sink: &mut S, +) -> Result<(), S::Error> { + sink.begin_frame(src.width(), src.height())?; + + let w = src.width() as usize; + let h = src.height() as usize; + let y_stride = src.y_stride() as usize; + let u_stride = src.u_stride() as usize; + let v_stride = src.v_stride() as usize; + let chroma_width = w / 2; + + let y_plane = src.y(); + let u_plane = src.u(); + let v_plane = src.v(); + + for row in 0..h { + let y_start = row * y_stride; + let y = &y_plane[y_start..y_start + w]; + + let chroma_row = row / 2; + let u_start = chroma_row * u_stride; + let v_start = chroma_row * v_stride; + let u_half = &u_plane[u_start..u_start + chroma_width]; + let v_half = &v_plane[v_start..v_start + chroma_width]; + + sink.process(Yuv420p10Row::new( + y, u_half, v_half, row, matrix, full_range, + ))?; + } + Ok(()) +}