diff --git a/Cargo.toml b/Cargo.toml index 2f34a10..458d138 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,10 @@ harness = false name = "yuv_420p10_to_rgb" harness = false +[[bench]] +name = "p010_to_rgb" +harness = false + [[bench]] name = "rgb_to_hsv" harness = false diff --git a/benches/p010_to_rgb.rs b/benches/p010_to_rgb.rs new file mode 100644 index 0000000..6466b64 --- /dev/null +++ b/benches/p010_to_rgb.rs @@ -0,0 +1,106 @@ +//! Per‑row P010 (semi‑planar 4:2:0, 10‑bit, high‑bit‑packed) → RGB +//! throughput baseline. +//! +//! Mirrors [`yuv_420p10_to_rgb`] — two output paths per width: +//! - `u8_*` — P010 → packed 8‑bit RGB (hot path for scene / keyframe +//! detection). +//! - `u16_*` — P010 → native‑depth 10‑bit RGB in `u16` storage +//! (lossless, for HDR tone mapping). +//! +//! Each width gets a `scalar` vs `simd` pair so the SIMD speedup on +//! whichever backend the dispatcher selects is a two‑line comparison +//! in the Criterion report. + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use std::hint::black_box; + +use colconv::{ + ColorMatrix, + row::{p010_to_rgb_row, p010_to_rgb_u16_row}, +}; + +/// Fills a `u16` buffer with a deterministic P010‑packed pseudo‑random +/// sequence — 10‑bit values shifted into the high 10 bits of each +/// `u16` (low 6 bits zero), matching the real P010 storage layout. +fn fill_pseudo_random_p010(buf: &mut [u16], seed: u32) { + let mut state = seed; + for b in buf { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + *b = (((state >> 8) & 0x3FF) as u16) << 6; + } +} + +fn bench(c: &mut Criterion) { + // 720p / 1080p / 4K widths — multiples of 64 so the widest SIMD + // tier (AVX‑512, 64 pixels per iteration) covers each block fully. 
+ const WIDTHS: &[usize] = &[1280, 1920, 3840]; + const MATRIX: ColorMatrix = ColorMatrix::Bt2020Ncl; + const FULL_RANGE: bool = false; + + // ---- u8 output ------------------------------------------------------ + let mut group_u8 = c.benchmark_group("p010_to_rgb_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + // UV row payload is `width` u16 elements (w / 2 interleaved pairs). + let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p010(&mut y, 0x1111); + fill_pseudo_random_p010(&mut uv, 0x2222); + let mut rgb = std::vec![0u8; w * 3]; + + group_u8.throughput(Throughput::Bytes((w * 3) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u8_simd" } else { "u8_scalar" }; + group_u8.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p010_to_rgb_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u8.finish(); + + // ---- u16 native-depth output ---------------------------------------- + let mut group_u16 = c.benchmark_group("p010_to_rgb_u16_row"); + + for &w in WIDTHS { + let mut y = std::vec![0u16; w]; + let mut uv = std::vec![0u16; w]; + fill_pseudo_random_p010(&mut y, 0x1111); + fill_pseudo_random_p010(&mut uv, 0x2222); + let mut rgb = std::vec![0u16; w * 3]; + + // u16 output writes 2× the bytes of u8. 
+ group_u16.throughput(Throughput::Bytes((w * 3 * 2) as u64)); + + for use_simd in [false, true] { + let label = if use_simd { "u16_simd" } else { "u16_scalar" }; + group_u16.bench_with_input(BenchmarkId::new(label, w), &w, |b, &w| { + b.iter(|| { + p010_to_rgb_u16_row( + black_box(&y), + black_box(&uv), + black_box(&mut rgb), + w, + MATRIX, + FULL_RANGE, + use_simd, + ); + }); + }); + } + } + group_u16.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/src/frame.rs b/src/frame.rs index 585d59f..8becc63 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -463,6 +463,369 @@ pub enum Nv12FrameError { }, } +/// A validated P010 (semi‑planar 4:2:0, 10‑bit `u16`) frame. +/// +/// The canonical layout emitted by Apple VideoToolbox, VA‑API, NVDEC, +/// D3D11VA, and Intel QSV for 10‑bit HDR hardware‑decoded output. Same +/// plane shape as [`Nv12Frame`] — one full‑size luma plane plus one +/// interleaved UV plane at half width and half height — but sample +/// width is **`u16`** and the 10 active bits sit in the **high** 10 of +/// each element (`sample = value << 6`, low 6 bits zero). That matches +/// Microsoft's P010 convention and FFmpeg's `AV_PIX_FMT_P010LE`. +/// +/// This is **not** the [`Yuv420p10Frame`] layout — yuv420p10le puts the +/// 10 bits in the **low** 10 of each `u16`. Callers holding a P010 +/// buffer must use [`P010Frame`]; callers holding yuv420p10le must use +/// [`Yuv420p10Frame`]. Kernels mask/shift appropriately for each. +/// +/// Stride is in **samples** (`u16` elements), not bytes. Users holding +/// an FFmpeg byte buffer should cast via [`bytemuck::cast_slice`] and +/// divide `linesize[i]` by 2 before constructing. +/// +/// Two planes: +/// - `y` — full‑size luma, `y_stride >= width`, length +/// `>= y_stride * height` (all in `u16` samples). 
+/// - `uv` — interleaved chroma (`U0, V0, U1, V1, …`) at half width and +/// half height, so each UV row carries `2 * ceil(width / 2) = width` +/// `u16` elements; `uv_stride >= width`, length +/// `>= uv_stride * ceil(height / 2)`. +/// +/// `width` must be even (same 4:2:0 rationale as the other frame +/// types); `height` may be odd (handled via `height.div_ceil(2)` in +/// chroma‑row sizing). +/// +/// # Input sample range and packing sanity +/// +/// Each `u16` sample's 10 active bits live in the high 10 positions; +/// the low 6 bits are expected to be zero. [`Self::try_new`] validates +/// geometry only. +/// +/// [`Self::try_new_checked`] additionally scans every sample and +/// rejects any with non‑zero low 6 bits — a **necessary but not +/// sufficient** packing sanity check. It catches mispacked +/// `yuv420p10le` buffers as long as **at least one** sample has +/// low‑bit content (the usual case for noisy real‑world image data), +/// but it **cannot distinguish** P010 from a `yuv420p10le` buffer +/// whose samples all happen to be multiples of 64. Values like +/// `Y = 64` (limited‑range black) and `UV = 512` (neutral chroma) +/// both have low 6 bits zero and so pass the check, even though the +/// buffer layout is wrong. For strict provenance, callers must rely +/// on their source format metadata and pick the right frame type +/// ([`P010Frame`] vs [`Yuv420p10Frame`]) at construction. +/// +/// Kernels shift each load right by 6 to extract the 10‑bit value, +/// so mispacked input (e.g. a `yuv420p10le` buffer handed to the +/// P010 kernel) produces deterministic, backend‑independent output +/// — wrong colors, but consistently wrong across scalar + every +/// SIMD backend, which is visible in any output diff. 
+#[derive(Debug, Clone, Copy)] +pub struct P010Frame<'a> { + y: &'a [u16], + uv: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, +} + +impl<'a> P010Frame<'a> { + /// Constructs a new [`P010Frame`], validating dimensions and plane + /// lengths. Strides are in `u16` **samples**. + /// + /// Returns [`P010FrameError`] if any of: + /// - `width` or `height` is zero, + /// - `width` is odd, + /// - `y_stride < width`, + /// - `uv_stride < width` (the UV row holds `width / 2` interleaved + /// pairs = `width` `u16` elements), + /// - either plane is too short, or + /// - `stride * rows` overflows `usize` (32‑bit targets only). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn try_new( + y: &'a [u16], + uv: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Result { + if width == 0 || height == 0 { + return Err(P010FrameError::ZeroDimension { width, height }); + } + if width & 1 != 0 { + return Err(P010FrameError::OddWidth { width }); + } + if y_stride < width { + return Err(P010FrameError::YStrideTooSmall { width, y_stride }); + } + let uv_row_elems = width; + if uv_stride < uv_row_elems { + return Err(P010FrameError::UvStrideTooSmall { + uv_row_elems, + uv_stride, + }); + } + + let y_min = match (y_stride as usize).checked_mul(height as usize) { + Some(v) => v, + None => { + return Err(P010FrameError::GeometryOverflow { + stride: y_stride, + rows: height, + }); + } + }; + if y.len() < y_min { + return Err(P010FrameError::YPlaneTooShort { + expected: y_min, + actual: y.len(), + }); + } + let chroma_height = height.div_ceil(2); + let uv_min = match (uv_stride as usize).checked_mul(chroma_height as usize) { + Some(v) => v, + None => { + return Err(P010FrameError::GeometryOverflow { + stride: uv_stride, + rows: chroma_height, + }); + } + }; + if uv.len() < uv_min { + return Err(P010FrameError::UvPlaneTooShort { + expected: uv_min, + actual: uv.len(), + }); + } + + Ok(Self { + y, + uv, + width, + height, + 
y_stride, + uv_stride, + }) + } + + /// Constructs a new [`P010Frame`], panicking on invalid inputs. + /// Prefer [`Self::try_new`] when inputs may be invalid at runtime. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn new( + y: &'a [u16], + uv: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Self { + match Self::try_new(y, uv, width, height, y_stride, uv_stride) { + Ok(frame) => frame, + Err(_) => panic!("invalid P010Frame dimensions or plane lengths"), + } + } + + /// Like [`Self::try_new`] but additionally scans every sample and + /// rejects any whose **low 6 bits** are non‑zero. A valid P010 + /// sample has its 10 active bits in the high 10 positions and zero + /// below, so non‑zero low bits is evidence the buffer isn't P010. + /// + /// **This is a packing sanity check, not a provenance validator.** + /// The check catches noisy `yuv420p10le` data (where most samples + /// have low‑bit content), but it **cannot** distinguish P010 from + /// a `yuv420p10le` buffer whose samples all happen to be multiples + /// of 64. Common flat‑region values like `Y = 64` (limited‑range + /// black) or `UV = 512` (neutral chroma) are multiples of 64 in + /// both layouts, so a yuv420p10le buffer of flat content will + /// silently pass this check. Callers who need strict provenance + /// must rely on their source format metadata and pick the right + /// frame type at construction ([`P010Frame`] vs [`Yuv420p10Frame`]); + /// no runtime check on opaque `u16` data can reliably tell the two + /// layouts apart. + /// + /// Cost: one O(plane_size) scan per plane. The default + /// [`Self::try_new`] skips this so the hot path stays O(1). + /// + /// Returns [`P010FrameError::SampleLowBitsSet`] on the first + /// offending sample — carries the plane, element index, and + /// offending value. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn try_new_checked( + y: &'a [u16], + uv: &'a [u16], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Result { + let frame = Self::try_new(y, uv, width, height, y_stride, uv_stride)?; + let w = width as usize; + let h = height as usize; + let uv_w = w; // interleaved: `width / 2` pairs × 2 elements + let chroma_h = height.div_ceil(2) as usize; + for row in 0..h { + let start = row * y_stride as usize; + for (col, &s) in y[start..start + w].iter().enumerate() { + if s & 0x3F != 0 { + return Err(P010FrameError::SampleLowBitsSet { + plane: P010FramePlane::Y, + index: start + col, + value: s, + }); + } + } + } + for row in 0..chroma_h { + let start = row * uv_stride as usize; + for (col, &s) in uv[start..start + uv_w].iter().enumerate() { + if s & 0x3F != 0 { + return Err(P010FrameError::SampleLowBitsSet { + plane: P010FramePlane::Uv, + index: start + col, + value: s, + }); + } + } + } + Ok(frame) + } + + /// Y (luma) plane samples. Row `r` starts at sample offset + /// `r * y_stride()`. Each sample's 10 active bits sit in the **high** + /// 10 positions of the `u16` (low 6 bits zero). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn y(&self) -> &'a [u16] { + self.y + } + + /// Interleaved UV plane samples. Each chroma row starts at sample + /// offset `chroma_row * uv_stride()` and contains `width` `u16` + /// elements laid out as `U0, V0, U1, V1, …, U_{w/2-1}, V_{w/2-1}`. + /// Each element's 10 active bits sit in the high 10 positions. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn uv(&self) -> &'a [u16] { + self.uv + } + + /// Frame width in pixels. Always even. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn width(&self) -> u32 { + self.width + } + + /// Frame height in pixels. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn height(&self) -> u32 { + self.height + } + + /// Sample stride of the Y plane (`>= width`). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn y_stride(&self) -> u32 { + self.y_stride + } + + /// Sample stride of the interleaved UV plane (`>= width`). + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn uv_stride(&self) -> u32 { + self.uv_stride + } +} + +/// Identifies which plane of a [`P010Frame`] a +/// [`P010FrameError::SampleLowBitsSet`] refers to. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)] +pub enum P010FramePlane { + /// Luma plane. + Y, + /// Interleaved UV plane. + Uv, +} + +/// Errors returned by [`P010Frame::try_new`] and +/// [`P010Frame::try_new_checked`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IsVariant, Error)] +#[non_exhaustive] +pub enum P010FrameError { + /// `width` or `height` was zero. + #[error("width ({width}) or height ({height}) is zero")] + ZeroDimension { + /// The supplied width. + width: u32, + /// The supplied height. + height: u32, + }, + /// `width` was odd. Same 4:2:0 rationale as the other semi‑planar + /// formats. + #[error("width ({width}) is odd; 4:2:0 requires even width")] + OddWidth { + /// The supplied width. + width: u32, + }, + /// `y_stride < width` (in `u16` samples). + #[error("y_stride ({y_stride}) is smaller than width ({width})")] + YStrideTooSmall { + /// Declared frame width in pixels. + width: u32, + /// The supplied Y‑plane stride (samples). + y_stride: u32, + }, + /// `uv_stride` is smaller than the `width` `u16` elements of + /// interleaved UV payload one chroma row must hold. + #[error("uv_stride ({uv_stride}) is smaller than UV row payload ({uv_row_elems} u16 elements)")] + UvStrideTooSmall { + /// Required minimum UV‑plane stride (`= width`). + uv_row_elems: u32, + /// The supplied UV‑plane stride (samples). + uv_stride: u32, + }, + /// Y plane is shorter than `y_stride * height` samples. + #[error("Y plane has {actual} samples but at least {expected} are required")] + YPlaneTooShort { + /// Minimum samples required. 
+ expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// UV plane is shorter than `uv_stride * ceil(height / 2)` samples. + #[error("UV plane has {actual} samples but at least {expected} are required")] + UvPlaneTooShort { + /// Minimum samples required. + expected: usize, + /// Actual samples supplied. + actual: usize, + }, + /// `stride * rows` overflows `usize` (32‑bit targets only). + #[error("declared geometry overflows usize: stride={stride} * rows={rows}")] + GeometryOverflow { + /// Stride of the plane whose size overflowed. + stride: u32, + /// Row count that overflowed against the stride. + rows: u32, + }, + /// A sample's low 6 bits were non‑zero — P010 packs its 10 active + /// bits in the high 10 of each `u16`, so valid samples are always + /// multiples of 64 (`value << 6`). Only + /// [`P010Frame::try_new_checked`] can produce this error. + /// + /// Note: the absence of this error does **not** prove the buffer + /// is P010. A `yuv420p10le` buffer of samples that all happen to + /// be multiples of 64 (e.g. `Y = 64`, `UV = 512`) passes the + /// check silently. See [`P010Frame::try_new_checked`] for the + /// full discussion. + #[error( + "sample {value:#06x} on plane {plane} at element {index} has non-zero low 6 bits (not a valid P010 sample)" + )] + SampleLowBitsSet { + /// Which plane the offending sample lives on. + plane: P010FramePlane, + /// Element index within that plane's slice. + index: usize, + /// The offending sample value. + value: u16, + }, +} + /// A validated NV21 (semi‑planar 4:2:0) frame. /// /// Structurally identical to [`Nv12Frame`] — one full-size luma plane @@ -1764,4 +2127,177 @@ mod tests { let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); assert!(matches!(e, Yuv420pFrame16Error::YPlaneTooShort { .. })); } + + // ---- P010Frame --------------------------------------------------------- + // + // Semi‑planar 10‑bit. 
Plane shape mirrors Nv12Frame (Y + interleaved + // UV) but sample width is `u16` with the 10 active bits in the + // **high** 10 of each element (`value << 6`). Strides are in + // samples, not bytes. + + fn p010_planes() -> (std::vec::Vec, std::vec::Vec) { + // 16×8 frame — UV plane carries 16 u16 × 4 chroma rows = 64 u16. + // P010 white Y = 1023 << 6 = 0xFFC0; neutral UV = 512 << 6 = 0x8000. + (std::vec![0xFFC0u16; 16 * 8], std::vec![0x8000u16; 16 * 4]) + } + + #[test] + fn p010_try_new_accepts_valid_tight() { + let (y, uv) = p010_planes(); + let f = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).expect("valid"); + assert_eq!(f.width(), 16); + assert_eq!(f.height(), 8); + assert_eq!(f.uv_stride(), 16); + } + + #[test] + fn p010_try_new_accepts_odd_height() { + // 640×481 — same concrete odd‑height case covered by NV12 / NV21. + let y = std::vec![0u16; 640 * 481]; + let uv = std::vec![0x8000u16; 640 * 241]; + let f = P010Frame::try_new(&y, &uv, 640, 481, 640, 640).expect("odd height valid"); + assert_eq!(f.height(), 481); + } + + #[test] + fn p010_try_new_rejects_odd_width() { + let (y, uv) = p010_planes(); + let e = P010Frame::try_new(&y, &uv, 15, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::OddWidth { width: 15 })); + } + + #[test] + fn p010_try_new_rejects_zero_dim() { + let (y, uv) = p010_planes(); + let e = P010Frame::try_new(&y, &uv, 0, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::ZeroDimension { .. })); + } + + #[test] + fn p010_try_new_rejects_y_stride_under_width() { + let (y, uv) = p010_planes(); + let e = P010Frame::try_new(&y, &uv, 16, 8, 8, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::YStrideTooSmall { .. })); + } + + #[test] + fn p010_try_new_rejects_uv_stride_under_width() { + let (y, uv) = p010_planes(); + let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 8).unwrap_err(); + assert!(matches!(e, P010FrameError::UvStrideTooSmall { .. 
})); + } + + #[test] + fn p010_try_new_rejects_short_y_plane() { + let y = std::vec![0u16; 10]; + let uv = std::vec![0x8000u16; 16 * 4]; + let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::YPlaneTooShort { .. })); + } + + #[test] + fn p010_try_new_rejects_short_uv_plane() { + let y = std::vec![0u16; 16 * 8]; + let uv = std::vec![0x8000u16; 8]; + let e = P010Frame::try_new(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::UvPlaneTooShort { .. })); + } + + #[test] + #[should_panic(expected = "invalid P010Frame")] + fn p010_new_panics_on_invalid() { + let y = std::vec![0u16; 10]; + let uv = std::vec![0x8000u16; 16 * 4]; + let _ = P010Frame::new(&y, &uv, 16, 8, 16, 16); + } + + #[cfg(target_pointer_width = "32")] + #[test] + fn p010_try_new_rejects_geometry_overflow() { + let big: u32 = 0x1_0000; + let y: [u16; 0] = []; + let uv: [u16; 0] = []; + let e = P010Frame::try_new(&y, &uv, big, big, big, big).unwrap_err(); + assert!(matches!(e, P010FrameError::GeometryOverflow { .. })); + } + + #[test] + fn p010_try_new_checked_accepts_shifted_samples() { + // Valid P010 samples: low 6 bits zero. + let (y, uv) = p010_planes(); + P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).expect("shifted samples valid"); + } + + #[test] + fn p010_try_new_checked_rejects_y_low_bits_set() { + // A Y sample with low 6 bits set — characteristic of yuv420p10le + // packing (value in low 10 bits) accidentally handed to the P010 + // constructor. `try_new_checked` catches this; plain `try_new` + // would let the kernel mask it down and produce wrong colors. + let mut y = std::vec![0xFFC0u16; 16 * 8]; + y[3 * 16 + 5] = 0x03FF; // 10-bit value in low bits — wrong packing + let uv = std::vec![0x8000u16; 16 * 4]; + let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + match e { + P010FrameError::SampleLowBitsSet { plane, value, .. 
} => { + assert_eq!(plane, P010FramePlane::Y); + assert_eq!(value, 0x03FF); + } + other => panic!("expected SampleLowBitsSet, got {other:?}"), + } + } + + #[test] + fn p010_try_new_checked_rejects_uv_plane_sample() { + let y = std::vec![0xFFC0u16; 16 * 8]; + let mut uv = std::vec![0x8000u16; 16 * 4]; + uv[2 * 16 + 3] = 0x0001; // low bit set + let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!( + e, + P010FrameError::SampleLowBitsSet { + plane: P010FramePlane::Uv, + value: 0x0001, + .. + } + )); + } + + #[test] + fn p010_try_new_checked_reports_geometry_errors_first() { + let y = std::vec![0u16; 10]; // Too small. + let uv = std::vec![0x8000u16; 16 * 4]; + let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!(e, P010FrameError::YPlaneTooShort { .. })); + } + + /// Regression documenting a **known limitation** of + /// [`P010Frame::try_new_checked`]: the low‑6‑bits‑zero check is a + /// packing sanity check, not a provenance validator. A + /// `yuv420p10le` buffer whose samples all happen to be multiples + /// of 64 — e.g. `Y = 64` (limited‑range black, `0x0040`) and + /// `UV = 512` (neutral chroma, `0x0200`) — passes the check + /// silently, even though the layout is wrong and downstream P010 + /// kernels will produce incorrect output. + /// + /// The test asserts the check accepts these values so the limit + /// is visible in the test log; any future attempt to tighten the + /// constructor into a real provenance validator will need to + /// update or replace this test. + #[test] + fn p010_try_new_checked_accepts_ambiguous_yuv420p10le_samples() { + // `yuv420p10le`-style samples, all multiples of 64: low 6 bits + // are zero, so they pass the P010 sanity check even though this + // is wrong data for a P010 frame. 
+ let y = std::vec![0x0040u16; 16 * 8]; // limited-range black in 10-bit low-packed + let uv = std::vec![0x0200u16; 16 * 4]; // neutral chroma in 10-bit low-packed + let f = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16) + .expect("known limitation: low-6-bits-zero check cannot tell yuv420p10le from P010"); + assert_eq!(f.width(), 16); + // Downstream decoding of this frame would produce wrong colors + // (every `>> 6` extracts 1 from Y=0x0040 and 8 from UV=0x0200, + // which P010 kernels then bias/scale as if those were the 10-bit + // source values). That's accepted behavior — the type system, + // not `try_new_checked`, is what keeps yuv420p10le out of P010. + } } diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 5cefb9e..f98d9cd 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -37,10 +37,10 @@ use core::arch::aarch64::{ vaddq_s32, vandq_u16, vbslq_f32, vceqq_f32, vcltq_f32, vcombine_s16, vcombine_u8, vcombine_u16, vcvtq_f32_u32, vcvtq_u32_f32, vdivq_f32, vdupq_n_f32, vdupq_n_s16, vdupq_n_s32, vdupq_n_u16, vget_high_s16, vget_high_u8, vget_high_u16, vget_low_s16, vget_low_u8, vget_low_u16, vld1_u8, - vld1q_u8, vld1q_u16, vld2_u8, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, vmovl_s16, - vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, vqaddq_s16, - vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, vst1q_u8, - vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, + vld1q_u8, vld1q_u16, vld2_u8, vld2q_u16, vld3q_u8, vmaxq_f32, vmaxq_s16, vminq_f32, vminq_s16, + vmovl_s16, vmovl_u8, vmovl_u16, vmovn_u16, vmovn_u32, vmulq_f32, vmulq_s32, vmvnq_u32, + vqaddq_s16, vqmovn_s32, vqmovun_s16, vreinterpretq_s16_u16, vreinterpretq_u16_s16, vshrq_n_s32, + vshrq_n_u16, vst1q_u8, vst3q_u8, vst3q_u16, vsubq_f32, vsubq_s16, vzip1q_s16, vzip2q_s16, }; use crate::{ColorMatrix, row::scalar}; @@ -488,6 +488,267 @@ fn clamp_u10(v: int16x8_t, zero_v: int16x8_t, max_v: 
int16x8_t) -> uint16x8_t { unsafe { vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(v, zero_v), max_v)) } } +/// NEON P010 → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. Differences +/// from [`yuv420p10_to_rgb_row`]: +/// - UV is semi‑planar interleaved (`U0, V0, U1, V1, …`), split in +/// one shot via `vld2q_u16` (returns separate U and V vectors). +/// - Each `u16` load is **shifted right by 6** (`vshrq_n_u16::<6>`) +/// instead of AND‑masked — P010 packs its 10 active bits in the +/// HIGH 10 of each `u16`, so `>> 6` extracts the value and +/// simultaneously clears the low 6 bits (which the format mandates +/// are zero anyway; the shift makes mispacked input deterministic). +/// - Chroma bias is 512 (10‑bit center) after the shift. +/// +/// After the shift, the rest of the pipeline is identical to the +/// `yuv420p10` path — same `chroma_i16x8` / `scale_y` / `chroma_dup` +/// / `vst3q_u8` write, with `range_params_n::<10, 8>` scaling. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: NEON availability is the caller's obligation. 
+ unsafe { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // 16 Y pixels in two u16x8 loads, shifted right by 6 to extract + // the 10‑bit values from P010's high‑bit packing. + let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); + let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + + // Semi‑planar UV: `vld2q_u16` loads 16 interleaved `u16` elements + // and returns (evens, odds) = (U, V) in one shot. Each gets the + // same `>> 6` shift as Y. + let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); + let u_vec = vshrq_n_u16::<6>(uv_pair.0); + let v_vec = vshrq_n_u16::<6>(uv_pair.1); + + let y_lo = vreinterpretq_s16_u16(y_vec_lo); + let y_hi = vreinterpretq_s16_u16(y_vec_hi); + + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, 
u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = vzip1q_s16(r_chroma, r_chroma); + let r_dup_hi = vzip2q_s16(r_chroma, r_chroma); + let g_dup_lo = vzip1q_s16(g_chroma, g_chroma); + let g_dup_hi = vzip2q_s16(g_chroma, g_chroma); + let b_dup_lo = vzip1q_s16(b_chroma, b_chroma); + let b_dup_hi = vzip2q_s16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_lo, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); + + let b_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, b_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, b_dup_hi)), + ); + let g_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, g_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, g_dup_hi)), + ); + let r_u8 = vcombine_u8( + vqmovun_s16(vqaddq_s16(y_scaled_lo, r_dup_lo)), + vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), + ); + + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 16; + } + + if x < width { + scalar::p010_to_rgb_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// NEON P010 → packed **10‑bit `u16`** RGB (native‑depth, low‑bit‑ +/// packed output — `yuv420p10le` convention, not P010). +/// +/// Same structure as [`p010_to_rgb_row`] up to the chroma compute; +/// the only differences are: +/// - `range_params_n::<10, 10>` → larger scales targeting the 10‑bit +/// output range. +/// - Clamp is explicit min/max to `[0, 1023]` via +/// [`clamp_u10`](crate::row::arch::neon::clamp_u10). +/// - Writes use two `vst3q_u16` calls per 16‑pixel block. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: NEON availability is the caller's obligation. + unsafe { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let max_v = vdupq_n_s16(OUT_MAX_10); + let zero_v = vdupq_n_s16(0); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + let y_vec_lo = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x))); + let y_vec_hi = vshrq_n_u16::<6>(vld1q_u16(y.as_ptr().add(x + 8))); + let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); + let u_vec = vshrq_n_u16::<6>(uv_pair.0); + let v_vec = vshrq_n_u16::<6>(uv_pair.1); + + let y_lo = vreinterpretq_s16_u16(y_vec_lo); + let y_hi = vreinterpretq_s16_u16(y_vec_hi); + + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); 
+ let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = vzip1q_s16(r_chroma, r_chroma); + let r_dup_hi = vzip2q_s16(r_chroma, r_chroma); + let g_dup_lo = vzip1q_s16(g_chroma, g_chroma); + let g_dup_hi = vzip2q_s16(g_chroma, g_chroma); + let b_dup_lo = vzip1q_s16(b_chroma, b_chroma); + let b_dup_hi = vzip2q_s16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_lo, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_hi, y_off_v, y_scale_v, rnd_v); + + let r_lo = clamp_u10(vqaddq_s16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10(vqaddq_s16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10(vqaddq_s16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10(vqaddq_s16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10(vqaddq_s16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10(vqaddq_s16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + + x += 16; + } + + if x < width { + scalar::p010_to_rgb_u16_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + /// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over the /// shared [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. 
            }
        }
    }

    // ---- P010 NEON scalar-equivalence --------------------------------------

    /// P010 test samples: 10‑bit values shifted into the high 10 bits
    /// (`value << 6`). Deterministic pseudo‑random generator keyed by
    /// index × seed so U, V, Y vectors are mutually distinct.
    fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
        (0..n)
            .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6)
            .collect()
    }

    /// Interleaves per‑pair U, V samples into P010's semi‑planar UV
    /// layout: `[U0, V0, U1, V1, …]`.
    fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> {
        let pairs = u.len();
        debug_assert_eq!(u.len(), v.len());
        let mut out = std::vec::Vec::with_capacity(pairs * 2);
        for i in 0..pairs {
            out.push(u[i]);
            out.push(v[i]);
        }
        out
    }

    // Runs scalar and NEON u8 kernels on identical input and pinpoints the
    // first diverging byte on mismatch.
    fn check_p010_u8_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        let y = p010_plane(width, 37);
        let u_plane = p010_plane(width / 2, 53);
        let v_plane = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u_plane, &v_plane);
        let mut rgb_scalar = std::vec![0u8; width * 3];
        let mut rgb_neon = std::vec![0u8; width * 3];

        scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: this test module only compiles/runs on NEON-capable hosts.
        unsafe {
            p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range);
        }
        if rgb_scalar != rgb_neon {
            let diff = rgb_scalar
                .iter()
                .zip(rgb_neon.iter())
                .position(|(a, b)| a != b)
                .unwrap();
            panic!(
                "NEON P010→u8 diverges at byte {diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
                rgb_scalar[diff], rgb_neon[diff]
            );
        }
    }

    // Same as above for the native-depth u16 output path.
    fn check_p010_u16_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        let y = p010_plane(width, 37);
        let u_plane = p010_plane(width / 2, 53);
        let v_plane = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u_plane, &v_plane);
        let mut rgb_scalar = std::vec![0u16; width * 3];
        let mut rgb_neon = std::vec![0u16; width * 3];

        scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: this test module only compiles/runs on NEON-capable hosts.
        unsafe {
            p010_to_rgb_u16_row(&y, &uv, &mut rgb_neon, width, matrix, full_range);
        }
        if rgb_scalar != rgb_neon {
            let diff = rgb_scalar
                .iter()
                .zip(rgb_neon.iter())
                .position(|(a, b)| a != b)
                .unwrap();
            panic!(
                "NEON P010→u16 diverges at elem {diff} (width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
                rgb_scalar[diff], rgb_neon[diff]
            );
        }
    }

    #[test]
    fn neon_p010_u8_matches_scalar_all_matrices_16() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u8_equivalence(16, m, full);
            }
        }
    }

    #[test]
    fn neon_p010_u16_matches_scalar_all_matrices_16() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u16_equivalence(16, m, full);
            }
        }
    }

    // Widths that are even but not multiples of the 16-pixel SIMD block,
    // so the scalar tail path is exercised.
    #[test]
    fn neon_p010_matches_scalar_odd_tail_widths() {
        for w in [18usize, 30, 34, 1922] {
            check_p010_u8_equivalence(w, ColorMatrix::Bt601, false);
            check_p010_u16_equivalence(w, ColorMatrix::Bt709, true);
        }
    }

    #[test]
    fn neon_p010_matches_scalar_1920() {
        check_p010_u8_equivalence(1920, ColorMatrix::Bt709, false);
        check_p010_u16_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }

    /// Adversarial regression: mispacked input — `yuv420p10le` values
    /// (10 bits in low 10) accidentally handed to the P010 kernel, or
    /// arbitrary bit corruption — must still produce bit‑identical
    /// output on scalar and NEON. The kernel's `>> 6` load extracts
    /// only the high 10 bits, so any low‑6‑bits data gets deterministically
    /// discarded in both paths.
    #[test]
    fn neon_p010_matches_scalar_on_mispacked_input() {
        let width = 32;

        // Three input variants:
        // - `yuv420p10le_style`: values in low 10 bits (wrong packing
        //   for P010 — `>> 6` drops the actual data, producing near‑black).
        // - `noise`: arbitrary 16‑bit noise, no particular pattern.
        // - `every_bit`: each sample has every bit set (0xFFFF).
        for variant in ["yuv420p10le_style", "noise", "every_bit"] {
            let y: std::vec::Vec<u16> = match variant {
                "every_bit" => std::vec![0xFFFFu16; width],
                "yuv420p10le_style" => (0..width).map(|i| ((i * 37 + 11) & 0x3FF) as u16).collect(),
                _ => (0..width)
                    .map(|i| ((i as u32 * 53 + 0xDEAD) as u16) ^ 0xA5A5)
                    .collect(),
            };
            let uv: std::vec::Vec<u16> = match variant {
                "every_bit" => std::vec![0xFFFFu16; width],
                "yuv420p10le_style" => (0..width).map(|i| ((i * 71 + 23) & 0x3FF) as u16).collect(),
                _ => (0..width)
                    .map(|i| ((i as u32 * 91 + 0xBEEF) as u16) ^ 0x5A5A)
                    .collect(),
            };

            for matrix in [ColorMatrix::Bt601, ColorMatrix::Bt709, ColorMatrix::YCgCo] {
                for full_range in [true, false] {
                    let mut rgb_scalar = std::vec![0u8; width * 3];
                    let mut rgb_neon = std::vec![0u8; width * 3];
                    scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
                    // SAFETY: NEON-capable host (see module gate).
                    unsafe {
                        p010_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range);
                    }
                    assert_eq!(
                        rgb_scalar, rgb_neon,
                        "scalar and NEON diverge on {variant} P010 input (matrix={matrix:?}, full_range={full_range})"
                    );

                    let mut rgb16_scalar = std::vec![0u16; width * 3];
                    let mut rgb16_neon = std::vec![0u16; width * 3];
                    scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb16_scalar, width, matrix, full_range);
                    // SAFETY: NEON-capable host (see module gate).
                    unsafe {
                        p010_to_rgb_u16_row(&y, &uv, &mut rgb16_neon, width, matrix, full_range);
                    }
                    assert_eq!(
                        rgb16_scalar, rgb16_neon,
                        "scalar and NEON diverge on {variant} P010 u16 output (matrix={matrix:?}, full_range={full_range})"
                    );
                }
            }
        }
    }
}
b/src/row/arch/wasm_simd128.rs index 8b0175c..21efa7c 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -41,8 +41,8 @@ use core::arch::wasm32::{ i16x8_narrow_i32x4, i16x8_splat, i16x8_sub, i32x4_add, i32x4_extend_high_i16x8, i32x4_extend_low_i16x8, i32x4_mul, i32x4_shr, i32x4_splat, i32x4_trunc_sat_f32x4, u8x16_narrow_i16x8, u8x16_swizzle, u16x8_extend_high_u8x16, u16x8_extend_low_u8x16, - u16x8_load_extend_u8x8, u16x8_splat, u32x4_extend_high_u16x8, u32x4_extend_low_u16x8, v128, - v128_and, v128_bitselect, v128_load, v128_or, v128_store, + u16x8_load_extend_u8x8, u16x8_shr, u16x8_splat, u32x4_extend_high_u16x8, u32x4_extend_low_u16x8, + v128, v128_and, v128_bitselect, v128_load, v128_or, v128_store, }; use crate::{ColorMatrix, row::scalar}; @@ -500,6 +500,269 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { } } +/// WASM simd128 P010 → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. Mirrors +/// [`yuv420p10_to_rgb_row`] with two structural differences: +/// - Samples are shifted right by 6 (`u16x8_shr(_, 6)`) instead of +/// AND‑masked. +/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_wasm`] +/// (two `u8x16_swizzle` + two `i8x16_shuffle` combines). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. 
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn p010_to_rgb_row(
    y: &[u16],
    uv_half: &[u16],
    rgb_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(uv_half.len() >= width);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10-bit input scaled straight to 8-bit output depth.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
    let bias = scalar::chroma_bias::<10>();
    // Q15 round-to-nearest constant.
    const RND: i32 = 1 << 14;

    // SAFETY: simd128 compile‑time availability is the caller's
    // obligation.
    unsafe {
        let rnd_v = i32x4_splat(RND);
        let y_off_v = i16x8_splat(y_off as i16);
        let y_scale_v = i32x4_splat(y_scale);
        let c_scale_v = i32x4_splat(c_scale);
        let bias_v = i16x8_splat(bias as i16);
        let cru = i32x4_splat(coeffs.r_u());
        let crv = i32x4_splat(coeffs.r_v());
        let cgu = i32x4_splat(coeffs.g_u());
        let cgv = i32x4_splat(coeffs.g_v());
        let cbu = i32x4_splat(coeffs.b_u());
        let cbv = i32x4_splat(coeffs.b_v());

        let mut x = 0usize;
        // 16 Y pixels / 8 UV pairs per iteration.
        while x + 16 <= width {
            // `>> 6` drops P010's zero-padded low bits, leaving 10-bit values.
            let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6);
            let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6);
            let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x));
            let u_vec = u16x8_shr(u_vec, 6);
            let v_vec = u16x8_shr(v_vec, 6);

            let u_i16 = i16x8_sub(u_vec, bias_v);
            let v_i16 = i16x8_sub(v_vec, bias_v);

            let u_lo_i32 = i32x4_extend_low_i16x8(u_i16);
            let u_hi_i32 = i32x4_extend_high_i16x8(u_i16);
            let v_lo_i32 = i32x4_extend_low_i16x8(v_i16);
            let v_hi_i32 = i32x4_extend_high_i16x8(v_i16);

            // Range-expand chroma in Q15 with rounding.
            let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v));
            let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v));
            let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v));
            let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v));

            let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);

            // Duplicate half-rate chroma across each pixel pair (4:2:0).
            let r_dup_lo = dup_lo(r_chroma);
            let r_dup_hi = dup_hi(r_chroma);
            let g_dup_lo = dup_lo(g_chroma);
            let g_dup_hi = dup_hi(g_chroma);
            let b_dup_lo = dup_lo(b_chroma);
            let b_dup_hi = dup_hi(b_chroma);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v);

            let b_lo = i16x8_add_sat(y_scaled_lo, b_dup_lo);
            let b_hi = i16x8_add_sat(y_scaled_hi, b_dup_hi);
            let g_lo = i16x8_add_sat(y_scaled_lo, g_dup_lo);
            let g_hi = i16x8_add_sat(y_scaled_hi, g_dup_hi);
            let r_lo = i16x8_add_sat(y_scaled_lo, r_dup_lo);
            let r_hi = i16x8_add_sat(y_scaled_hi, r_dup_hi);

            // Saturating i16 → u8 narrow handles the [0, 255] clamp.
            let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi);
            let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi);
            let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi);

            write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));

            x += 16;
        }

        // Tail: scalar fallback; `x` is even so the UV slice stays aligned.
        if x < width {
            scalar::p010_to_rgb_row(
                &y[x..width],
                &uv_half[x..width],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// WASM simd128 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed
/// `yuv420p10le` convention).
///
/// # Numerical contract
///
/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`].
///
/// # Safety
///
/// 1. **simd128 must be enabled at compile time.**
/// 2. `width & 1 == 0`.
/// 3. `y.len() >= width`, `uv_half.len() >= width`,
///    `rgb_out.len() >= 3 * width`.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn p010_to_rgb_u16_row(
    y: &[u16],
    uv_half: &[u16],
    rgb_out: &mut [u16],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(uv_half.len() >= width);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10-bit in → 10-bit out: no depth change, only range expansion.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range);
    let bias = scalar::chroma_bias::<10>();
    // Q15 round-to-nearest constant.
    const RND: i32 = 1 << 14;
    const OUT_MAX_10: i16 = 1023;

    // SAFETY: simd128 compile‑time availability is the caller's
    // obligation.
    unsafe {
        let rnd_v = i32x4_splat(RND);
        let y_off_v = i16x8_splat(y_off as i16);
        let y_scale_v = i32x4_splat(y_scale);
        let c_scale_v = i32x4_splat(c_scale);
        let bias_v = i16x8_splat(bias as i16);
        let max_v = i16x8_splat(OUT_MAX_10);
        let zero_v = i16x8_splat(0);
        let cru = i32x4_splat(coeffs.r_u());
        let crv = i32x4_splat(coeffs.r_v());
        let cgu = i32x4_splat(coeffs.g_u());
        let cgv = i32x4_splat(coeffs.g_v());
        let cbu = i32x4_splat(coeffs.b_u());
        let cbv = i32x4_splat(coeffs.b_v());

        let mut x = 0usize;
        // 16 Y pixels / 8 UV pairs per iteration.
        while x + 16 <= width {
            // `>> 6` recovers 10-bit values from P010's high-bit packing.
            let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), 6);
            let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), 6);
            let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x));
            let u_vec = u16x8_shr(u_vec, 6);
            let v_vec = u16x8_shr(v_vec, 6);

            let u_i16 = i16x8_sub(u_vec, bias_v);
            let v_i16 = i16x8_sub(v_vec, bias_v);

            let u_lo_i32 = i32x4_extend_low_i16x8(u_i16);
            let u_hi_i32 = i32x4_extend_high_i16x8(u_i16);
            let v_lo_i32 = i32x4_extend_low_i16x8(v_i16);
            let v_hi_i32 = i32x4_extend_high_i16x8(v_i16);

            // Range-expand chroma in Q15 with rounding.
            let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v));
            let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v));
            let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v));
            let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v));

            let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);

            // Duplicate half-rate chroma across each pixel pair (4:2:0).
            let r_dup_lo = dup_lo(r_chroma);
            let r_dup_hi = dup_hi(r_chroma);
            let g_dup_lo = dup_lo(g_chroma);
            let g_dup_hi = dup_hi(g_chroma);
            let b_dup_lo = dup_lo(b_chroma);
            let b_dup_hi = dup_hi(b_chroma);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v);

            // Saturating add, then explicit clamp into [0, 1023].
            let r_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, r_dup_lo), zero_v, max_v);
            let r_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, r_dup_hi), zero_v, max_v);
            let g_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, g_dup_lo), zero_v, max_v);
            let g_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, g_dup_hi), zero_v, max_v);
            let b_lo = clamp_u10_wasm(i16x8_add_sat(y_scaled_lo, b_dup_lo), zero_v, max_v);
            let b_hi = clamp_u10_wasm(i16x8_add_sat(y_scaled_hi, b_dup_hi), zero_v, max_v);

            // Each 8-pixel half writes 24 u16 elements.
            let dst = rgb_out.as_mut_ptr().add(x * 3);
            write_rgb_u16_8(r_lo, g_lo, b_lo, dst);
            write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24));

            x += 16;
        }

        // Tail: scalar fallback; `x` is even so the UV slice stays aligned.
        if x < width {
            scalar::p010_to_rgb_u16_row(
                &y[x..width],
                &uv_half[x..width],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// Deinterleaves 16 `u16` elements at `ptr` into `(u_vec, v_vec)` —
/// two 128‑bit vectors each holding 8 `u16` samples. Wasm's
/// `u8x16_swizzle` is semantically equivalent to SSSE3
/// `_mm_shuffle_epi8` (indices ≥ 16 zero the lane), so the same
/// split‑mask pattern applies. `i8x16_shuffle` is used for the
/// cross‑vector 64‑bit recombine.
///
/// # Safety
///
/// `ptr` must point to at least 32 readable bytes (16 `u16`
/// elements). Caller must have simd128 enabled at compile time.
#[inline(always)]
unsafe fn deinterleave_uv_u16_wasm(ptr: *const u16) -> (v128, v128) {
    unsafe {
        // Pack evens (U's) into low 8 bytes, odds (V's) into high 8 bytes.
        // Byte pairs (0,1), (4,5), … select the even u16 lanes; (2,3),
        // (6,7), … select the odd ones.
        let split_mask = i8x16(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);

        let chunk0 = v128_load(ptr.cast());
        let chunk1 = v128_load(ptr.add(8).cast());

        let s0 = u8x16_swizzle(chunk0, split_mask);
        let s1 = u8x16_swizzle(chunk1, split_mask);

        // u_vec = low 8 bytes of s0 + low 8 bytes of s1.
        // v_vec = high 8 bytes of s0 + high 8 bytes of s1.
        let u_vec = i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(s0, s1);
        let v_vec =
            i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(s0, s1);
        (u_vec, v_vec)
    }
}

/// WASM simd128 NV12 → packed RGB (UV-ordered chroma). Thin wrapper
/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`.
        check_p10_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false);
        check_p10_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }

    // ---- P010 simd128 scalar-equivalence --------------------------------

    // Deterministic P010 plane: 10-bit values packed into the high 10 bits
    // (`value << 6`), keyed by index × seed.
    fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
        (0..n)
            .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6)
            .collect()
    }

    // Interleaves per-pair U, V into P010's semi-planar [U0, V0, U1, V1, …].
    fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> {
        let pairs = u.len();
        debug_assert_eq!(u.len(), v.len());
        let mut out = std::vec::Vec::with_capacity(pairs * 2);
        for i in 0..pairs {
            out.push(u[i]);
            out.push(v[i]);
        }
        out
    }

    // Asserts the simd128 u8 kernel is byte-identical to the scalar kernel.
    fn check_p010_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        let y = p010_plane(width, 37);
        let u = p010_plane(width / 2, 53);
        let v = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u, &v);
        let mut rgb_scalar = std::vec![0u8; width * 3];
        let mut rgb_simd = std::vec![0u8; width * 3];
        scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: simd128 is a compile-time feature of the wasm test build.
        unsafe {
            p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range);
        }
        assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u8 diverges");
    }

    // Same assertion for the native-depth u16 output path.
    fn check_p010_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        let y = p010_plane(width, 37);
        let u = p010_plane(width / 2, 53);
        let v = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u, &v);
        let mut rgb_scalar = std::vec![0u16; width * 3];
        let mut rgb_simd = std::vec![0u16; width * 3];
        scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: simd128 is a compile-time feature of the wasm test build.
        unsafe {
            p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range);
        }
        assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u16 diverges");
    }

    #[test]
    fn simd128_p010_u8_matches_scalar_all_matrices() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u8_simd128_equivalence(16, m, full);
            }
        }
    }

    #[test]
    fn simd128_p010_u16_matches_scalar_all_matrices() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u16_simd128_equivalence(16, m, full);
            }
        }
    }

    // Even widths that are not multiples of the 16-pixel block → tail path.
    #[test]
    fn simd128_p010_matches_scalar_tail_widths() {
        for w in [18usize, 30, 34, 1922] {
            check_p010_u8_simd128_equivalence(w, ColorMatrix::Bt601, false);
            check_p010_u16_simd128_equivalence(w, ColorMatrix::Bt709, true);
        }
    }

    #[test]
    fn simd128_p010_matches_scalar_1920() {
        check_p010_u8_simd128_equivalence(1920, ColorMatrix::Bt709, false);
        check_p010_u16_simd128_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }
}

    _mm256_castsi256_si128, _mm256_cvtepi16_epi32, _mm256_cvtepu8_epi16, _mm256_extracti128_si256,
    _mm256_loadu_si256, _mm256_max_epi16, _mm256_min_epi16, _mm256_mullo_epi32, _mm256_packs_epi32,
    _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set1_epi16,
    _mm256_set1_epi32, _mm256_setr_epi8, _mm256_shuffle_epi8, _mm256_srai_epi32, _mm256_srli_epi16,
    _mm256_sub_epi16, _mm256_unpackhi_epi16, _mm256_unpacklo_epi16,
};

use crate::{
    unsafe { _mm256_min_epi16(_mm256_max_epi16(v, zero_v), max_v) }
}

/// AVX2 P010 → packed **8‑bit** RGB.
///
/// Block size 32 Y pixels / 16 chroma pairs per iteration.
/// Mirrors [`yuv420p10_to_rgb_row`] with two structural differences:
/// - Samples are shifted right by 6 (`_mm256_srli_epi16::<6>`)
///   instead of AND‑masked.
/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx2`]
///   (two `_mm256_shuffle_epi8` + two `_mm256_permute4x64_epi64` +
///   two `_mm256_permute2x128_si256` per 32 chroma elements).
///
/// # Numerical contract
///
/// Byte‑identical to [`scalar::p010_to_rgb_row`].
///
/// # Safety
///
/// 1. **AVX2 must be available on the current CPU.**
/// 2. `width & 1 == 0`.
/// 3. `y.len() >= width`, `uv_half.len() >= width`,
///    `rgb_out.len() >= 3 * width`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn p010_to_rgb_row(
    y: &[u16],
    uv_half: &[u16],
    rgb_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(uv_half.len() >= width);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10-bit input scaled straight to 8-bit output depth.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range);
    let bias = scalar::chroma_bias::<10>();
    // Q15 round-to-nearest constant.
    const RND: i32 = 1 << 14;

    // SAFETY: AVX2 availability is the caller's obligation.
    unsafe {
        let rnd_v = _mm256_set1_epi32(RND);
        let y_off_v = _mm256_set1_epi16(y_off as i16);
        let y_scale_v = _mm256_set1_epi32(y_scale);
        let c_scale_v = _mm256_set1_epi32(c_scale);
        let bias_v = _mm256_set1_epi16(bias as i16);
        let cru = _mm256_set1_epi32(coeffs.r_u());
        let crv = _mm256_set1_epi32(coeffs.r_v());
        let cgu = _mm256_set1_epi32(coeffs.g_u());
        let cgv = _mm256_set1_epi32(coeffs.g_v());
        let cbu = _mm256_set1_epi32(coeffs.b_u());
        let cbv = _mm256_set1_epi32(coeffs.b_v());

        let mut x = 0usize;
        while x + 32 <= width {
            // 32 Y = two u16×16 loads, shifted right by 6.
            let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast()));
            let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()));

            // 32 UV (16 pairs) — deinterleave + shift.
            let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x));
            let u_vec = _mm256_srli_epi16::<6>(u_vec);
            let v_vec = _mm256_srli_epi16::<6>(v_vec);

            let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
            let v_i16 = _mm256_sub_epi16(v_vec, bias_v);

            let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
            let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16));
            let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
            let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16));

            // Range-expand chroma in Q15 with rounding.
            let u_d_lo = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(u_lo_i32, c_scale_v),
                rnd_v,
            ));
            let u_d_hi = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(u_hi_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_lo = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(v_lo_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_hi = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(v_hi_i32, c_scale_v),
                rnd_v,
            ));

            let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);

            // Duplicate half-rate chroma across each pixel pair (4:2:0).
            let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma);
            let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma);
            let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v);

            let b_lo = _mm256_adds_epi16(y_scaled_lo, b_dup_lo);
            let b_hi = _mm256_adds_epi16(y_scaled_hi, b_dup_hi);
            let g_lo = _mm256_adds_epi16(y_scaled_lo, g_dup_lo);
            let g_hi = _mm256_adds_epi16(y_scaled_hi, g_dup_hi);
            let r_lo = _mm256_adds_epi16(y_scaled_lo, r_dup_lo);
            let r_hi = _mm256_adds_epi16(y_scaled_hi, r_dup_hi);

            // Saturating narrow handles the [0, 255] clamp.
            let b_u8 = narrow_u8x32(b_lo, b_hi);
            let g_u8 = narrow_u8x32(g_lo, g_hi);
            let r_u8 = narrow_u8x32(r_lo, r_hi);

            write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));

            x += 32;
        }

        // Tail: scalar fallback; `x` is even so the UV slice stays aligned.
        if x < width {
            scalar::p010_to_rgb_row(
                &y[x..width],
                &uv_half[x..width],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// AVX2 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed
/// `yuv420p10le` convention).
///
/// # Numerical contract
///
/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`].
///
/// # Safety
///
/// 1. **AVX2 must be available on the current CPU.**
/// 2. `width & 1 == 0`.
/// 3. `y.len() >= width`, `uv_half.len() >= width`,
///    `rgb_out.len() >= 3 * width`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn p010_to_rgb_u16_row(
    y: &[u16],
    uv_half: &[u16],
    rgb_out: &mut [u16],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert_eq!(width & 1, 0);
    debug_assert!(y.len() >= width);
    debug_assert!(uv_half.len() >= width);
    debug_assert!(rgb_out.len() >= width * 3);

    let coeffs = scalar::Coefficients::for_matrix(matrix);
    // 10-bit in → 10-bit out: no depth change, only range expansion.
    let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range);
    let bias = scalar::chroma_bias::<10>();
    // Q15 round-to-nearest constant.
    const RND: i32 = 1 << 14;
    const OUT_MAX_10: i16 = 1023;

    // SAFETY: AVX2 availability is the caller's obligation.
    unsafe {
        let rnd_v = _mm256_set1_epi32(RND);
        let y_off_v = _mm256_set1_epi16(y_off as i16);
        let y_scale_v = _mm256_set1_epi32(y_scale);
        let c_scale_v = _mm256_set1_epi32(c_scale);
        let bias_v = _mm256_set1_epi16(bias as i16);
        let max_v = _mm256_set1_epi16(OUT_MAX_10);
        let zero_v = _mm256_set1_epi16(0);
        let cru = _mm256_set1_epi32(coeffs.r_u());
        let crv = _mm256_set1_epi32(coeffs.r_v());
        let cgu = _mm256_set1_epi32(coeffs.g_u());
        let cgv = _mm256_set1_epi32(coeffs.g_v());
        let cbu = _mm256_set1_epi32(coeffs.b_u());
        let cbv = _mm256_set1_epi32(coeffs.b_v());

        let mut x = 0usize;
        while x + 32 <= width {
            // `>> 6` recovers 10-bit values from P010's high-bit packing.
            let y_low_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x).cast()));
            let y_high_i16 = _mm256_srli_epi16::<6>(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()));
            let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x));
            let u_vec = _mm256_srli_epi16::<6>(u_vec);
            let v_vec = _mm256_srli_epi16::<6>(v_vec);

            let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
            let v_i16 = _mm256_sub_epi16(v_vec, bias_v);

            let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
            let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16));
            let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
            let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16));

            // Range-expand chroma in Q15 with rounding.
            let u_d_lo = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(u_lo_i32, c_scale_v),
                rnd_v,
            ));
            let u_d_hi = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(u_hi_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_lo = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(v_lo_i32, c_scale_v),
                rnd_v,
            ));
            let v_d_hi = q15_shift(_mm256_add_epi32(
                _mm256_mullo_epi32(v_hi_i32, c_scale_v),
                rnd_v,
            ));

            let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
            let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);

            // Duplicate half-rate chroma across each pixel pair (4:2:0).
            let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma);
            let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma);
            let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma);

            let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v);
            let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v);

            // Saturating add, then clamp into [0, 1023].
            let r_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v);
            let r_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v);
            let g_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v);
            let g_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v);
            let b_lo = clamp_u10_x16(_mm256_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v);
            let b_hi = clamp_u10_x16(_mm256_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v);

            // Four 8-pixel stores, 24 u16 elements apart.
            let dst = rgb_out.as_mut_ptr().add(x * 3);
            write_rgb_u16_8(
                _mm256_castsi256_si128(r_lo),
                _mm256_castsi256_si128(g_lo),
                _mm256_castsi256_si128(b_lo),
                dst,
            );
            write_rgb_u16_8(
                _mm256_extracti128_si256::<1>(r_lo),
                _mm256_extracti128_si256::<1>(g_lo),
                _mm256_extracti128_si256::<1>(b_lo),
                dst.add(24),
            );
            write_rgb_u16_8(
                _mm256_castsi256_si128(r_hi),
                _mm256_castsi256_si128(g_hi),
                _mm256_castsi256_si128(b_hi),
                dst.add(48),
            );
            write_rgb_u16_8(
                _mm256_extracti128_si256::<1>(r_hi),
                _mm256_extracti128_si256::<1>(g_hi),
                _mm256_extracti128_si256::<1>(b_hi),
                dst.add(72),
            );

            x += 32;
        }

        // Tail: scalar fallback; `x` is even so the UV slice stays aligned.
        if x < width {
            scalar::p010_to_rgb_u16_row(
                &y[x..width],
                &uv_half[x..width],
                &mut rgb_out[x * 3..width * 3],
                width - x,
                matrix,
                full_range,
            );
        }
    }
}

/// Deinterleaves 32 `u16` elements at `ptr` (`[U0, V0, U1, V1, …,
/// U15, V15]`) into `(u_vec, v_vec)` — two AVX2 vectors each holding
/// 16 packed `u16` samples.
///
/// Uses per‑lane `_mm256_shuffle_epi8` to pack each 128‑bit lane's
/// U/V samples into the low/high 64 bits, then
/// `_mm256_permute4x64_epi64::<0xD8>` to move the two U halves
/// together (low 128) and the two V halves together (high 128) within
/// each source vector, and finally `_mm256_permute2x128_si256` to
/// combine the four U halves and the four V halves across the two
/// vectors. 2 loads + 2 shuffles + 2 per-vector permutes + 2 cross-
/// vector permutes = 8 ops.
///
/// # Safety
///
/// `ptr` must point to at least 64 readable bytes (32 `u16`
/// elements). Caller's `target_feature` must include AVX2.
#[inline(always)]
unsafe fn deinterleave_uv_u16_avx2(ptr: *const u16) -> (__m256i, __m256i) {
    unsafe {
        // Per‑lane byte mask: within each 128‑bit lane, pack even u16s
        // (U's) into low 8 bytes, odd u16s (V's) into high 8 bytes.
        let split_mask = _mm256_setr_epi8(
            0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, // low lane
            0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, // high lane
        );

        let uv0 = _mm256_loadu_si256(ptr.cast());
        let uv1 = _mm256_loadu_si256(ptr.add(16).cast());

        // After per‑lane shuffle: each vector is
        // `[U_lane0_lo, V_lane0_lo, U_lane1_lo, V_lane1_lo]` in 64‑bit
        // chunks.
        let s0 = _mm256_shuffle_epi8(uv0, split_mask);
        let s1 = _mm256_shuffle_epi8(uv1, split_mask);

        // Permute 4×64 within each vector to get [U0..U7, V0..V7] and
        // [U8..U15, V8..V15]. Mask 0xD8 = (3,1,2,0) → picks 64-bit
        // chunks 0, 2, 1, 3 from the source, rearranging
        // [A, B, C, D] → [A, C, B, D].
        let s0_p = _mm256_permute4x64_epi64::<0xD8>(s0);
        let s1_p = _mm256_permute4x64_epi64::<0xD8>(s1);

        // Cross-vector permute: low 128 of s0_p + low 128 of s1_p → U's;
        // high 128 of s0_p + high 128 of s1_p → V's.
        let u_vec = _mm256_permute2x128_si256::<0x20>(s0_p, s1_p);
        let v_vec = _mm256_permute2x128_si256::<0x31>(s0_p, s1_p);
        (u_vec, v_vec)
    }
}

/// AVX2 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over
/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`.
        check_p10_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false);
        check_p10_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }

    // ---- P010 AVX2 scalar-equivalence -----------------------------------

    // Deterministic P010 plane: 10-bit values packed into the high 10 bits.
    fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> {
        (0..n)
            .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6)
            .collect()
    }

    // Interleaves per-pair U, V into P010's semi-planar [U0, V0, U1, V1, …].
    fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> {
        let pairs = u.len();
        debug_assert_eq!(u.len(), v.len());
        let mut out = std::vec::Vec::with_capacity(pairs * 2);
        for i in 0..pairs {
            out.push(u[i]);
            out.push(v[i]);
        }
        out
    }

    // Asserts the AVX2 u8 kernel is byte-identical to the scalar kernel.
    // Skips silently on hosts without AVX2.
    fn check_p010_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        let y = p010_plane(width, 37);
        let u = p010_plane(width / 2, 53);
        let v = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u, &v);
        let mut rgb_scalar = std::vec![0u8; width * 3];
        let mut rgb_simd = std::vec![0u8; width * 3];
        scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: AVX2 presence checked above.
        unsafe {
            p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range);
        }
        assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u8 diverges");
    }

    // Same assertion for the native-depth u16 output path.
    fn check_p010_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        let y = p010_plane(width, 37);
        let u = p010_plane(width / 2, 53);
        let v = p010_plane(width / 2, 71);
        let uv = p010_uv_interleave(&u, &v);
        let mut rgb_scalar = std::vec![0u16; width * 3];
        let mut rgb_simd = std::vec![0u16; width * 3];
        scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range);
        // SAFETY: AVX2 presence checked above.
        unsafe {
            p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range);
        }
        assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u16 diverges");
    }

    #[test]
    fn avx2_p010_u8_matches_scalar_all_matrices() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u8_avx2_equivalence(32, m, full);
            }
        }
    }

    #[test]
    fn avx2_p010_u16_matches_scalar_all_matrices() {
        for m in [
            ColorMatrix::Bt601,
            ColorMatrix::Bt709,
            ColorMatrix::Bt2020Ncl,
            ColorMatrix::Smpte240m,
            ColorMatrix::Fcc,
            ColorMatrix::YCgCo,
        ] {
            for full in [true, false] {
                check_p010_u16_avx2_equivalence(32, m, full);
            }
        }
    }

    // Even widths that are not multiples of the 32-pixel block → tail path.
    #[test]
    fn avx2_p010_matches_scalar_odd_tail_widths() {
        for w in [34usize, 62, 66, 1922] {
            check_p010_u8_avx2_equivalence(w, ColorMatrix::Bt601, false);
            check_p010_u16_avx2_equivalence(w, ColorMatrix::Bt709, true);
        }
    }

    #[test]
    fn avx2_p010_matches_scalar_1920() {
        check_p010_u8_avx2_equivalence(1920, ColorMatrix::Bt709, false);
        check_p010_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false);
    }
}

    _mm512_extracti64x4_epi64, _mm512_loadu_si512, _mm512_max_epi16, _mm512_min_epi16,
    _mm512_mullo_epi32, _mm512_packs_epi32, _mm512_packus_epi16, _mm512_permutex2var_epi64,
    _mm512_permutexvar_epi64, _mm512_set1_epi16, _mm512_set1_epi32, _mm512_setr_epi64,
    _mm512_shuffle_epi8, _mm512_srai_epi32, _mm512_srli_epi16, _mm512_sub_epi16,
    _mm512_unpackhi_epi16, _mm512_unpacklo_epi16,
};

use crate::{
@@ -572,6 +572,317 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u } } +/// AVX‑512 P010 → packed **8‑bit** RGB. +/// +/// Block size 64 Y pixels / 32 chroma pairs per iteration. Mirrors +/// [`yuv420p10_to_rgb_row`] with two structural differences: +/// - Samples are shifted right by 6 (`_mm512_srli_epi16::<6>`) +/// instead of AND‑masked. +/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16_avx512`] +/// — per‑128‑lane shuffle + 64‑bit permute + cross‑vector +/// `_mm512_permutex2var_epi64` to produce 32‑sample U and V +/// vectors. +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// +/// # Safety +/// +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: AVX‑512BW availability is the caller's obligation. 
+ unsafe { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + let mut x = 0usize; + while x + 64 <= width { + let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); + let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); + let u_vec = _mm512_srli_epi16::<6>(u_vec); + let v_vec = _mm512_srli_epi16::<6>(v_vec); + + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + 
let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + let b_lo = _mm512_adds_epi16(y_scaled_lo, b_dup_lo); + let b_hi = _mm512_adds_epi16(y_scaled_hi, b_dup_hi); + let g_lo = _mm512_adds_epi16(y_scaled_lo, g_dup_lo); + let g_hi = _mm512_adds_epi16(y_scaled_hi, g_dup_hi); + let r_lo = _mm512_adds_epi16(y_scaled_lo, r_dup_lo); + let r_hi = _mm512_adds_epi16(y_scaled_hi, r_dup_hi); + + let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup); + let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); + let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); + + write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + x += 64; + } + + if x < width { + scalar::p010_to_rgb_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// AVX‑512 P010 → packed **10‑bit `u16`** RGB (low‑bit‑packed +/// `yuv420p10le` convention). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// +/// # Safety +/// +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: AVX‑512BW availability is the caller's obligation. + unsafe { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + let max_v = _mm512_set1_epi16(OUT_MAX_10); + let zero_v = _mm512_set1_epi16(0); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + let mut x = 0usize; + while x + 64 <= width { + let y_low_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x).cast())); + let y_high_i16 = _mm512_srli_epi16::<6>(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast())); + let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); + let u_vec = _mm512_srli_epi16::<6>(u_vec); + let v_vec = _mm512_srli_epi16::<6>(v_vec); + + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = 
_mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + let (r_dup_lo, r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + let r_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10_x32(_mm512_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10_x32(_mm512_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + let dst = rgb_out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, 
dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + + x += 64; + } + + if x < width { + scalar::p010_to_rgb_u16_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Deinterleaves 64 `u16` elements at `ptr` into `(u_vec, v_vec)` — +/// two AVX‑512 vectors each holding 32 packed `u16` samples. +/// +/// Per‑128‑bit‑lane `_mm512_shuffle_epi8` packs even u16s (U's) into +/// each lane's low 64 bits, odd u16s (V's) into the high 64. Then +/// `_mm512_permutexvar_epi64` with the existing `pack_fixup` index +/// `[0, 2, 4, 6, 1, 3, 5, 7]` rearranges the 64‑bit chunks so each +/// vector becomes `[U0..U15 | V0..V15]`. Finally +/// `_mm512_permutex2var_epi64` combines the two vectors into the +/// full 32‑sample U and V vectors. +/// +/// # Safety +/// +/// `ptr` must point to at least 128 readable bytes (64 `u16` +/// elements). Caller's `target_feature` must include AVX‑512F + +/// AVX‑512BW. +#[inline(always)] +unsafe fn deinterleave_uv_u16_avx512(ptr: *const u16) -> (__m512i, __m512i) { + unsafe { + // Per‑128‑lane mask (same byte pattern replicated across the 4 + // lanes of a `__m512i`). 
+ let split_mask = _mm512_broadcast_i32x4(_mm_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + )); + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + // Cross-vector 2x8 permute indices: + // u_vec = low 256 of each vec → chunks [0..3 of a, 0..3 of b] + // v_vec = high 256 of each vec → chunks [4..7 of a, 4..7 of b] + let u_perm = _mm512_setr_epi64(0, 1, 2, 3, 8, 9, 10, 11); + let v_perm = _mm512_setr_epi64(4, 5, 6, 7, 12, 13, 14, 15); + + let uv0 = _mm512_loadu_si512(ptr.cast()); + let uv1 = _mm512_loadu_si512(ptr.add(32).cast()); + + let s0 = _mm512_shuffle_epi8(uv0, split_mask); + let s1 = _mm512_shuffle_epi8(uv1, split_mask); + + // After per-lane shuffle + per-vector 64-bit permute, each vector + // is `[U0..U15 | V0..V15]` (low 256 = U's, high 256 = V's). + let s0_p = _mm512_permutexvar_epi64(pack_fixup, s0); + let s1_p = _mm512_permutexvar_epi64(pack_fixup, s1); + + let u_vec = _mm512_permutex2var_epi64(s0_p, u_perm, s1_p); + let v_vec = _mm512_permutex2var_epi64(s0_p, v_perm, s1_p); + (u_vec, v_vec) + } +} + /// AVX‑512 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over /// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. 
/// @@ -1425,4 +1736,103 @@ mod tests { check_p10_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); check_p10_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- P010 AVX-512 scalar-equivalence -------------------------------- + + fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> { + (0..n) + .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6) + .collect() + } + + fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> { + let pairs = u.len(); + debug_assert_eq!(u.len(), v.len()); + let mut out = std::vec::Vec::with_capacity(pairs * 2); + for i in 0..pairs { + out.push(u[i]); + out.push(v[i]); + } + out + } + + fn check_p010_u8_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p010_plane(width, 37); + let u = p010_plane(width / 2, 53); + let v = p010_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u8 diverges"); + } + + fn check_p010_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let y = p010_plane(width, 37); + let u = p010_plane(width / 2, 53); + let v = p010_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u16 diverges"); + } + + #[test] + fn 
avx512_p010_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p010_u8_avx512_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_p010_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p010_u16_avx512_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_p010_matches_scalar_odd_tail_widths() { + for w in [66usize, 126, 130, 1922] { + check_p010_u8_avx512_equivalence(w, ColorMatrix::Bt601, false); + check_p010_u16_avx512_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn avx512_p010_matches_scalar_1920() { + check_p010_u8_avx512_equivalence(1920, ColorMatrix::Bt709, false); + check_p010_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 297f1fb..66e385b 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -39,7 +39,8 @@ use core::arch::x86_64::{ __m128i, _mm_add_epi32, _mm_adds_epi16, _mm_and_si128, _mm_cvtepi16_epi32, _mm_cvtepu8_epi16, _mm_loadl_epi64, _mm_loadu_si128, _mm_max_epi16, _mm_min_epi16, _mm_mullo_epi32, _mm_packs_epi32, _mm_packus_epi16, _mm_set1_epi16, _mm_set1_epi32, _mm_setr_epi8, _mm_shuffle_epi8, - _mm_srai_epi32, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, _mm_unpacklo_epi16, + _mm_srai_epi32, _mm_srli_epi16, _mm_srli_si128, _mm_sub_epi16, _mm_unpackhi_epi16, + _mm_unpackhi_epi64, _mm_unpacklo_epi16, _mm_unpacklo_epi64, }; use crate::{ @@ -192,6 +193,271 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( } } +/// SSE4.1 P010 → packed **8‑bit** RGB. +/// +/// Block size 16 Y pixels / 8 chroma pairs per iteration. 
Differences +/// from [`yuv420p10_to_rgb_row`]: +/// - Samples are shifted right by 6 (`_mm_srli_epi16::<6>`) instead +/// of AND‑masked — P010's 10 active bits live in the HIGH 10 of +/// each `u16`. +/// - Semi‑planar UV is deinterleaved via [`deinterleave_uv_u16`] +/// below (one `_mm_shuffle_epi8` + two 64‑bit unpacks per 16 +/// chroma elements). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_row`]. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 8>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + + // SAFETY: SSE4.1 availability is the caller's obligation. + unsafe { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + // Y: two u16×8 loads, each shifted right by 6. 
+ let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); + let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + + // UV: two u16×8 loads of interleaved [U0,V0,U1,V1,...], then + // deinterleave into separate u_vec + v_vec. + let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); + let u_vec = _mm_srli_epi16::<6>(u_vec); + let v_vec = _mm_srli_epi16::<6>(v_vec); + + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = _mm_unpacklo_epi16(r_chroma, r_chroma); + let r_dup_hi = _mm_unpackhi_epi16(r_chroma, r_chroma); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma, g_chroma); + let g_dup_hi = _mm_unpackhi_epi16(g_chroma, g_chroma); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma, b_chroma); + let b_dup_hi = _mm_unpackhi_epi16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let b_lo = _mm_adds_epi16(y_scaled_lo, b_dup_lo); + let b_hi = _mm_adds_epi16(y_scaled_hi, b_dup_hi); + let g_lo = _mm_adds_epi16(y_scaled_lo, g_dup_lo); + let g_hi = 
_mm_adds_epi16(y_scaled_hi, g_dup_hi); + let r_lo = _mm_adds_epi16(y_scaled_lo, r_dup_lo); + let r_hi = _mm_adds_epi16(y_scaled_hi, r_dup_hi); + + let b_u8 = _mm_packus_epi16(b_lo, b_hi); + let g_u8 = _mm_packus_epi16(g_lo, g_hi); + let r_u8 = _mm_packus_epi16(r_lo, r_hi); + + write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + + x += 16; + } + + if x < width { + scalar::p010_to_rgb_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// SSE4.1 P010 → packed **10‑bit `u16`** RGB (native‑depth, +/// low‑bit‑packed — `yuv420p10le` convention). +/// +/// # Numerical contract +/// +/// Byte‑identical to [`scalar::p010_to_rgb_u16_row`]. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0); + debug_assert!(y.len() >= width); + debug_assert!(uv_half.len() >= width); + debug_assert!(rgb_out.len() >= width * 3); + + let coeffs = scalar::Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = scalar::range_params_n::<10, 10>(full_range); + let bias = scalar::chroma_bias::<10>(); + const RND: i32 = 1 << 14; + const OUT_MAX_10: i16 = 1023; + + // SAFETY: SSE4.1 availability is the caller's obligation. 
+ unsafe { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let max_v = _mm_set1_epi16(OUT_MAX_10); + let zero_v = _mm_set1_epi16(0); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + let mut x = 0usize; + while x + 16 <= width { + let y_low_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x).cast())); + let y_high_i16 = _mm_srli_epi16::<6>(_mm_loadu_si128(y.as_ptr().add(x + 8).cast())); + let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); + let u_vec = _mm_srli_epi16::<6>(u_vec); + let v_vec = _mm_srli_epi16::<6>(v_vec); + + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup_lo = _mm_unpacklo_epi16(r_chroma, r_chroma); + let r_dup_hi = _mm_unpackhi_epi16(r_chroma, r_chroma); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma, g_chroma); + let 
g_dup_hi = _mm_unpackhi_epi16(g_chroma, g_chroma); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma, b_chroma); + let b_dup_hi = _mm_unpackhi_epi16(b_chroma, b_chroma); + + let y_scaled_lo = scale_y(y_low_i16, y_off_v, y_scale_v, rnd_v); + let y_scaled_hi = scale_y(y_high_i16, y_off_v, y_scale_v, rnd_v); + + let r_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, r_dup_lo), zero_v, max_v); + let r_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, r_dup_hi), zero_v, max_v); + let g_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, g_dup_lo), zero_v, max_v); + let g_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, g_dup_hi), zero_v, max_v); + let b_lo = clamp_u10(_mm_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); + let b_hi = clamp_u10(_mm_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); + + write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + + x += 16; + } + + if x < width { + scalar::p010_to_rgb_u16_row( + &y[x..width], + &uv_half[x..width], + &mut rgb_out[x * 3..width * 3], + width - x, + matrix, + full_range, + ); + } + } +} + +/// Deinterleaves 16 `u16` elements at `ptr` (`[U0, V0, U1, V1, …, +/// U7, V7]`) into `(u_vec, v_vec)` where each vector holds 8 packed +/// `u16` samples. +/// +/// Each of the two 128‑bit loads is byte‑shuffled via +/// `_mm_shuffle_epi8` so that U samples land in the low 64 bits and +/// V samples in the high 64. Then `_mm_unpacklo_epi64` / +/// `_mm_unpackhi_epi64` combine the two halves into full u16×8 +/// vectors. 2 loads + 2 shuffles + 2 unpacks = 6 ops. +/// +/// # Safety +/// +/// `ptr` must point to at least 32 readable bytes (16 `u16` +/// elements). Caller's `target_feature` must include SSSE3 (via +/// SSE4.1 or a superset). +#[inline(always)] +unsafe fn deinterleave_uv_u16(ptr: *const u16) -> (__m128i, __m128i) { + unsafe { + // Per‑chunk mask: pack even u16s (U's) into low 8 bytes, odd u16s + // (V's) into high 8 bytes. 
+ let split_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let chunk0 = _mm_loadu_si128(ptr.cast()); + let chunk1 = _mm_loadu_si128(ptr.add(8).cast()); + let s0 = _mm_shuffle_epi8(chunk0, split_mask); + let s1 = _mm_shuffle_epi8(chunk1, split_mask); + let u_vec = _mm_unpacklo_epi64(s0, s1); + let v_vec = _mm_unpackhi_epi64(s0, s1); + (u_vec, v_vec) + } +} + /// SSE4.1 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over /// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. /// @@ -1212,4 +1478,103 @@ mod tests { check_p10_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false); check_p10_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } + + // ---- P010 SSE4.1 scalar-equivalence ---------------------------------- + + fn p010_plane(n: usize, seed: usize) -> std::vec::Vec<u16> { + (0..n) + .map(|i| (((i * seed + seed * 3) & 0x3FF) as u16) << 6) + .collect() + } + + fn p010_uv_interleave(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> { + let pairs = u.len(); + debug_assert_eq!(u.len(), v.len()); + let mut out = std::vec::Vec::with_capacity(pairs * 2); + for i in 0..pairs { + out.push(u[i]); + out.push(v[i]); + } + out + } + + fn check_p010_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let y = p010_plane(width, 37); + let u = p010_plane(width / 2, 53); + let v = p010_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u8; width * 3]; + let mut rgb_simd = std::vec![0u8; width * 3]; + scalar::p010_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p010_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u8 diverges"); + } + + fn check_p010_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let y = 
p010_plane(width, 37); + let u = p010_plane(width / 2, 53); + let v = p010_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgb_scalar = std::vec![0u16; width * 3]; + let mut rgb_simd = std::vec![0u16; width * 3]; + scalar::p010_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + unsafe { + p010_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + } + assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u16 diverges"); + } + + #[test] + fn sse41_p010_u8_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p010_u8_sse41_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_p010_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p010_u16_sse41_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_p010_matches_scalar_odd_tail_widths() { + for w in [18usize, 30, 34, 1922] { + check_p010_u8_sse41_equivalence(w, ColorMatrix::Bt601, false); + check_p010_u16_sse41_equivalence(w, ColorMatrix::Bt709, true); + } + } + + #[test] + fn sse41_p010_matches_scalar_1920() { + check_p010_u8_sse41_equivalence(1920, ColorMatrix::Bt709, false); + check_p010_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); + } } diff --git a/src/row/mod.rs b/src/row/mod.rs index 788789b..80afab7 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -498,6 +498,158 @@ pub fn yuv420p10_to_rgb_u16_row( scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); } +/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ +/// packed — 10 active bits in the high 10 of each `u16`) to packed +/// **8‑bit** RGB. 
+/// +/// This is the HDR hardware‑decode keystone format: VideoToolbox, +/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit +/// output. See `scalar::p010_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p010_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P010** to **native‑depth `u16`** packed RGB +/// (10 active bits in the **low** 10 of each output `u16`, matching +/// `yuv420p10le` convention — **not** the P010 high‑bit packing). +/// Callers feeding this output into a P010 consumer must shift left +/// by 6. +/// +/// See `scalar::p010_to_rgb_u16_row` for the full spec. +/// `use_simd = false` forces the scalar reference. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p010_to_rgb_u16_row( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p010_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); +} + /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. /// diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 9c6afa4..527ea4d 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -344,6 +344,135 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( } } +// ---- P010 (semi-planar 10-bit, high-bit-packed) → RGB ------------------ + +/// Converts one row of P010 (semi‑planar 4:2:0 with UV interleaved, +/// 10 active bits in the **high** 10 of each `u16`) to **8‑bit** +/// packed RGB. +/// +/// Structurally identical to [`nv12_to_rgb_row`] plus the per‑sample +/// shift: each `u16` load is extracted to its 10‑bit value via +/// `sample >> 6`, then the same Q15 pipeline as +/// [`yuv_420p_n_to_rgb_row`] runs with `BITS == 10`. Mispacked input +/// — e.g. a `yuv420p10le` buffer with values in the **low** 10 bits +/// — is masked down to a small positive number (producing near‑black +/// output) rather than silent garbage, matching every SIMD backend. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0, "P010 requires even width"); + debug_assert!(y.len() >= width, "y row too short"); + debug_assert!(uv_half.len() >= width, "uv row too short"); + debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + + let coeffs = Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = range_params_n::<10, 8>(full_range); + let bias = chroma_bias::<10>(); + + // Each `u16` load is converted to its 10-bit sample with `>> 6`, + // extracting the upper 10 bits and leaving the result in + // `[0, 1023]`. If low-packed input (`yuv420p10le`) is handed to + // this kernel by mistake, that shift discards the active low 6 bits + // rather than recovering the intended 10-bit value. No hot-path + // cost: one shift per load. + let mut x = 0; + while x < width { + let c_idx = x / 2; + let u_sample = uv_half[c_idx * 2] >> 6; + let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_d = q15_scale(u_sample as i32 - bias, c_scale); + let v_d = q15_scale(v_sample as i32 - bias, c_scale); + + let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); + let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); + let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); + + let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + rgb_out[x * 3] = clamp_u8(y0 + r_chroma); + rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); + rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + + let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); + rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); + rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + + x += 2; + } +} + +/// Converts one row of P010 to **native‑depth `u16`** packed RGB +/// (10 active bits in the low bits of each `u16`, matching 
+/// `yuv420p10le` convention — **not** P010's high‑bit packing). +/// +/// Mirrors [`yuv_420p_n_to_rgb_u16_row::<10>`] on the math side; the +/// only difference is the input shift (`sample >> 6` instead of +/// `sample & 0x3FF`) and the UV deinterleave. Output is suitable for +/// direct consumption by downstream `yuv420p10le`‑shaped tooling. If +/// you need P010‑packed RGB output, shift left by 6 on the caller. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `uv_half.len() >= width`, +/// `rgb_out.len() >= 3 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + debug_assert_eq!(width & 1, 0, "P010 requires even width"); + debug_assert!(y.len() >= width, "y row too short"); + debug_assert!(uv_half.len() >= width, "uv row too short"); + debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + + let coeffs = Coefficients::for_matrix(matrix); + let (y_off, y_scale, c_scale) = range_params_n::<10, 10>(full_range); + let bias = chroma_bias::<10>(); + let out_max: i32 = (1i32 << 10) - 1; + + let mut x = 0; + while x < width { + let c_idx = x / 2; + let u_sample = uv_half[c_idx * 2] >> 6; + let v_sample = uv_half[c_idx * 2 + 1] >> 6; + let u_d = q15_scale(u_sample as i32 - bias, c_scale); + let v_d = q15_scale(v_sample as i32 - bias, c_scale); + + let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); + let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); + let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); + + let y0 = q15_scale((y[x] >> 6) as i32 - y_off, y_scale); + rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; + rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + + let y1 = q15_scale((y[x + 1] >> 6) as i32 - y_off, y_scale); + 
rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; + rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; + rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + + x += 2; + } +} + /// Compile‑time sample mask for `BITS`: `(1 << BITS) - 1` as `u16`. /// Returns `0x03FF` for 10‑bit, `0x0FFF` for 12‑bit, `0x3FFF` for /// 14‑bit. SIMD backends splat this into a vector constant and AND @@ -990,4 +1119,112 @@ mod tests { "matrices should materially differ: {bt709:?} vs {ycgco:?}" ); } + + // ---- p010_to_rgb_row (P010 → u8) --------------------------------------- + // + // P010 samples: 10 active bits in the HIGH 10 of each u16. + // White Y = 1023 << 6 = 0xFFC0, neutral UV = 512 << 6 = 0x8000. + + #[test] + fn p010_rgb_black_full_range() { + // Y = 0, neutral UV → black. + let y = [0u16; 4]; + let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; // U0 V0 U1 V1 + let mut rgb = [0u8; 12]; + p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); + } + + #[test] + fn p010_rgb_white_full_range() { + // Y = 0xFFC0 = 1023 << 6, neutral UV → white. + let y = [0xFFC0u16; 4]; + let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; + let mut rgb = [0u8; 12]; + p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); + } + + #[test] + fn p010_rgb_gray_is_gray() { + // 10-bit mid-gray Y=512 → P010 Y = 512 << 6 = 0x8000. + let y = [0x8000u16; 4]; + let uv = [0x8000u16; 4]; + let mut rgb = [0u8; 12]; + p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); + assert_eq!(r, g); + assert_eq!(g, b); + assert!(r.abs_diff(128) <= 1, "got {r}"); + } + } + + #[test] + fn p010_rgb_limited_range_endpoints() { + // 10-bit limited black Y=64 → P010 = 64 << 6 = 0x1000. + // 10-bit limited white Y=940 → P010 = 940 << 6 = 0xEB00. 
+ let y = [0x1000u16, 0x1000, 0xEB00, 0xEB00]; + let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; + let mut rgb = [0u8; 12]; + p010_to_rgb_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); + assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); + assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); + assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); + assert_eq!((rgb[9], rgb[10], rgb[11]), (255, 255, 255)); + } + + #[test] + fn p010_matches_yuv420p10_when_shifted() { + // Handing the same logical samples to P010 (high-packed) and + // yuv420p10 (low-packed) must produce the same RGB output. + let y_p10 = [200u16, 800, 500, 700]; // 10-bit values + let u_p10 = [600u16, 400]; // 10-bit values + let v_p10 = [300u16, 900]; // 10-bit values + + let y_p010: [u16; 4] = core::array::from_fn(|i| y_p10[i] << 6); + let uv_p010: [u16; 4] = [u_p10[0] << 6, v_p10[0] << 6, u_p10[1] << 6, v_p10[1] << 6]; + + let mut rgb_p10 = [0u8; 12]; + let mut rgb_p010 = [0u8; 12]; + yuv_420p_n_to_rgb_row::<10>( + &y_p10, + &u_p10, + &v_p10, + &mut rgb_p10, + 4, + ColorMatrix::Bt709, + true, + ); + p010_to_rgb_row( + &y_p010, + &uv_p010, + &mut rgb_p010, + 4, + ColorMatrix::Bt709, + true, + ); + assert_eq!(rgb_p10, rgb_p010); + } + + // ---- p010_to_rgb_u16_row (P010 → native-depth u16) -------------------- + + #[test] + fn p010_rgb_u16_white_full_range() { + let y = [0xFFC0u16; 4]; + let uv = [0x8000u16; 4]; + let mut rgb = [0u16; 12]; + p010_to_rgb_u16_row(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + assert!(rgb.iter().all(|&c| c == 1023), "got {rgb:?}"); + } + + #[test] + fn p010_rgb_u16_limited_range_endpoints() { + let y = [0x1000u16, 0xEB00]; + let uv = [0x8000u16, 0x8000]; + let mut rgb = [0u16; 6]; + p010_to_rgb_u16_row(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); + assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); + assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); + } } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 432994e..92835a5 100644 --- 
a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -19,12 +19,12 @@ use thiserror::Error; use crate::{ HsvBuffers, PixelSink, SourceFormat, row::{ - nv12_to_rgb_row, nv21_to_rgb_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv420p10_to_rgb_row, - yuv420p10_to_rgb_u16_row, + nv12_to_rgb_row, nv21_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, rgb_to_hsv_row, + yuv_420_to_rgb_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, }, yuv::{ - Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, Yuv420p, Yuv420p10, Yuv420p10Row, - Yuv420p10Sink, Yuv420pRow, Yuv420pSink, + Nv12, Nv12Row, Nv12Sink, Nv21, Nv21Row, Nv21Sink, P010, P010Row, P010Sink, Yuv420p, Yuv420p10, + Yuv420p10Row, Yuv420p10Sink, Yuv420pRow, Yuv420pSink, }, }; @@ -219,6 +219,12 @@ pub enum RowSlice { /// `width / 2` elements. #[display("V Half 10")] VHalf10, + /// Half‑width interleaved UV row of a **10‑bit semi‑planar** source + /// ([`P010`]). `u16` samples, `width` elements laid out as + /// `U0, V0, U1, V1, …` (high‑bit‑packed: each element's 10 active + /// bits sit in the high 10 of its `u16`). + #[display("UV Half 10")] + UvHalf10, } /// A sink that writes any subset of `{RGB, Luma, HSV}` into @@ -1108,6 +1114,189 @@ impl PixelSink for MixedSinker<'_, Yuv420p10> { } } +// ---- P010 impl --------------------------------------------------------- + +impl<'a> MixedSinker<'a, P010> { + /// Attaches a packed **`u16`** RGB output buffer. Mirrors + /// [`MixedSinker::with_rgb_u16`] — compile‑time gated to + /// sinkers whose source format populates native‑depth RGB. + /// + /// Length is measured in `u16` **elements** (not bytes): minimum + /// `width × height × 3`. Output is **low‑bit‑packed** (10‑bit + /// values in the low 10 of each `u16`, upper 6 zero) — matches + /// FFmpeg `yuv420p10le` convention. This is **not** P010 packing + /// (which puts the 10 bits in the high 10); callers feeding a P010 + /// consumer must shift the output left by 6. 
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgb_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgb_u16(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgb_u16`](Self::with_rgb_u16). The
+    /// required length is measured in `u16` **elements**, not bytes.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgb_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected_elements = self.frame_bytes(3)?;
+        if buf.len() < expected_elements {
+            return Err(MixedSinkerError::RgbU16BufferTooShort {
+                expected: expected_elements,
+                actual: buf.len(),
+            });
+        }
+        self.rgb_u16 = Some(buf);
+        Ok(self)
+    }
+}
+
+impl P010Sink for MixedSinker<'_, P010> {}
+
+impl PixelSink for MixedSinker<'_, P010> {
+    type Input<'r> = P010Row<'r>;
+    type Error = MixedSinkerError;
+
+    fn begin_frame(&mut self, width: u32, height: u32) -> Result<(), Self::Error> {
+        if self.width & 1 != 0 {
+            return Err(MixedSinkerError::OddWidth { width: self.width });
+        }
+        check_dimensions_match(self.width, self.height, width, height)
+    }
+
+    fn process(&mut self, row: P010Row<'_>) -> Result<(), Self::Error> {
+        let w = self.width;
+        let h = self.height;
+        let idx = row.row();
+        let use_simd = self.simd;
+
+        if w & 1 != 0 {
+            return Err(MixedSinkerError::OddWidth { width: w });
+        }
+        if row.y().len() != w {
+            return Err(MixedSinkerError::RowShapeMismatch {
+                which: RowSlice::Y10,
+                row: idx,
+                expected: w,
+                actual: row.y().len(),
+            });
+        }
+        // Semi-planar UV: `width` u16 elements total (`width / 2` pairs).
+        if row.uv_half().len() != w {
+            return Err(MixedSinkerError::RowShapeMismatch {
+                which: RowSlice::UvHalf10,
+                row: idx,
+                expected: w,
+                actual: row.uv_half().len(),
+            });
+        }
+        if idx >= self.height {
+            return Err(MixedSinkerError::RowIndexOutOfRange {
+                row: idx,
+                configured_height: self.height,
+            });
+        }
+
+        let Self {
+            rgb,
+            rgb_u16,
+            luma,
+            hsv,
+            rgb_scratch,
+            ..
+ } = self; + + let one_plane_start = idx * w; + let one_plane_end = one_plane_start + w; + + // Luma: P010 samples are high-bit-packed (`value << 6`). Taking + // the high byte via `>> 8` gives the top 8 bits of the 10-bit + // value — functionally equivalent to + // `(value >> 2)` for the yuv420p10 path. + if let Some(luma) = luma.as_deref_mut() { + let dst = &mut luma[one_plane_start..one_plane_end]; + for (d, &s) in dst.iter_mut().zip(row.y().iter()) { + *d = (s >> 8) as u8; + } + } + + // `u16` RGB output — low-bit-packed 10-bit values (yuv420p10le + // convention), not P010's high-bit packing. + if let Some(buf) = rgb_u16.as_deref_mut() { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + p010_to_rgb_u16_row( + row.y(), + row.uv_half(), + &mut buf[rgb_plane_start..rgb_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + + let want_rgb = rgb.is_some(); + let want_hsv = hsv.is_some(); + if !want_rgb && !want_hsv { + return Ok(()); + } + + let rgb_row: &mut [u8] = match rgb.as_deref_mut() { + Some(buf) => { + let rgb_plane_end = + one_plane_end + .checked_mul(3) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + let rgb_plane_start = one_plane_start * 3; + &mut buf[rgb_plane_start..rgb_plane_end] + } + None => { + let rgb_row_bytes = w.checked_mul(3).ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 3, + })?; + if rgb_scratch.len() < rgb_row_bytes { + rgb_scratch.resize(rgb_row_bytes, 0); + } + &mut rgb_scratch[..rgb_row_bytes] + } + }; + + p010_to_rgb_row( + row.y(), + row.uv_half(), + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + + if let Some(hsv) = hsv.as_mut() { + rgb_to_hsv_row( + rgb_row, + &mut hsv.h[one_plane_start..one_plane_end], + &mut hsv.s[one_plane_start..one_plane_end], + &mut 
hsv.v[one_plane_start..one_plane_end],
+                w,
+                use_simd,
+            );
+        }
+        Ok(())
+    }
+}
+
 /// Returns `Ok(())` iff the walker's frame dimensions exactly match
 /// the sinker's configured dimensions. Called from
 /// [`PixelSink::begin_frame`] on both `MixedSinker` and
@@ -1145,8 +1334,8 @@ mod tests {
     use super::*;
     use crate::{
         ColorMatrix,
-        frame::{Nv12Frame, Nv21Frame, Yuv420p10Frame, Yuv420pFrame},
-        yuv::{nv12_to, nv21_to, yuv420p_to, yuv420p10_to},
+        frame::{Nv12Frame, Nv21Frame, P010Frame, Yuv420p10Frame, Yuv420pFrame},
+        yuv::{nv12_to, nv21_to, p010_to, yuv420p_to, yuv420p10_to},
     };
 
     fn solid_yuv420p_frame(
@@ -2170,4 +2359,176 @@ mod tests {
         assert_eq!(rgb_scalar, rgb_simd);
         assert_eq!(rgb_u16_scalar, rgb_u16_simd);
     }
+
+    // ---- P010 --------------------------------------------------------------
+    //
+    // Semi-planar 10-bit, high-bit-packed (samples in high 10 of each
+    // u16). Mirrors the Yuv420p10 test shape but with UV interleaved.
+
+    fn solid_p010_frame(
+        width: u32,
+        height: u32,
+        y_10bit: u16,
+        u_10bit: u16,
+        v_10bit: u16,
+    ) -> (Vec<u16>, Vec<u16>) {
+        let w = width as usize;
+        let h = height as usize;
+        let cw = w / 2;
+        let ch = h / 2;
+        // Shift into the high 10 bits (P010 packing).
+        let y = std::vec![y_10bit << 6; w * h];
+        let uv: Vec<u16> = (0..cw * ch)
+            .flat_map(|_| [u_10bit << 6, v_10bit << 6])
+            .collect();
+        (y, uv)
+    }
+
+    #[test]
+    fn p010_rgb_u8_only_gray_is_gray() {
+        // 10-bit mid-gray Y=512, UV=512 → ~128 u8 RGB across the frame.
+        let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512);
+        let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb = std::vec![0u8; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P010>::new(16, 8).with_rgb(&mut rgb).unwrap();
+        p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+        }
+    }
+
+    #[test]
+    fn p010_rgb_u16_only_native_depth_gray() {
+        // Output u16 is yuv420p10le-packed (10-bit in low 10) even though
+        // the input is P010-packed.
+        let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512);
+        let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P010>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .unwrap();
+        p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgb.chunks(3) {
+            assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+            assert!(
+                px[0] <= 1023,
+                "output must stay within 10-bit low-packed range"
+            );
+        }
+    }
+
+    #[test]
+    fn p010_rgb_u8_and_u16_both_populated() {
+        // 10-bit full-range white: Y=1023, UV=512. Both buffers fill in
+        // one call.
+        let (yp, uvp) = solid_p010_frame(16, 8, 1023, 512, 512);
+        let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
+        let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
+        let mut sink = MixedSinker::<P010>::new(16, 8)
+            .with_rgb(&mut rgb_u8)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16)
+            .unwrap();
+        p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(rgb_u8.iter().all(|&c| c == 255));
+        assert!(rgb_u16.iter().all(|&c| c == 1023));
+    }
+
+    #[test]
+    fn p010_luma_downshifts_to_8bit() {
+        // Y=512 at 10 bits, P010-packed (0x8000). After >> 8, the 8-bit
+        // luma is 0x80 = 128.
+        let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512);
+        let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+        let mut luma = std::vec![0u8; 16 * 8];
+        let mut sink = MixedSinker::<P010>::new(16, 8)
+            .with_luma(&mut luma)
+            .unwrap();
+        p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        assert!(luma.iter().all(|&l| l == 128));
+    }
+
+    #[test]
+    fn p010_matches_yuv420p10_mixed_sinker_with_shifted_samples() {
+        // Logical equivalence: same samples fed through the two formats
+        // (low-packed as yuv420p10, high-packed as P010) must produce
+        // byte-identical u8 RGB.
+        let w = 16u32;
+        let h = 8u32;
+        let y = 600u16;
+        let u = 400u16;
+        let v = 700u16;
+
+        let (yp_p10, up_p10, vp_p10) = solid_yuv420p10_frame(w, h, y, u, v);
+        let src_p10 = Yuv420p10Frame::new(&yp_p10, &up_p10, &vp_p10, w, h, w, w / 2, w / 2);
+
+        let (yp_p010, uvp_p010) = solid_p010_frame(w, h, y, u, v);
+        let src_p010 = P010Frame::new(&yp_p010, &uvp_p010, w, h, w, w);
+
+        let mut rgb_yuv = std::vec![0u8; (w * h * 3) as usize];
+        let mut rgb_p010 = std::vec![0u8; (w * h * 3) as usize];
+        let mut s_yuv = MixedSinker::<Yuv420p10>::new(w as usize, h as usize)
+            .with_rgb(&mut rgb_yuv)
+            .unwrap();
+        let mut s_p010 = MixedSinker::<P010>::new(w as usize, h as usize)
+            .with_rgb(&mut rgb_p010)
+            .unwrap();
+        yuv420p10_to(&src_p10, true, ColorMatrix::Bt709, &mut s_yuv).unwrap();
+        p010_to(&src_p010, true, ColorMatrix::Bt709, &mut s_p010).unwrap();
+        assert_eq!(rgb_yuv, rgb_p010);
+    }
+
+    #[test]
+    fn p010_rgb_u16_too_short_returns_err() {
+        let mut rgb = std::vec![0u16; 10];
+        let err = MixedSinker::<P010>::new(16, 8)
+            .with_rgb_u16(&mut rgb)
+            .err()
+            .unwrap();
+        assert!(matches!(err, MixedSinkerError::RgbU16BufferTooShort { .. }));
+    }
+
+    #[test]
+    fn p010_with_simd_false_matches_with_simd_true() {
+        // Stubs delegate to scalar so simd=true and simd=false produce
+        // byte-identical output for now. Real SIMD backends will replace
+        // the stubs — equivalence is preserved by design.
+        let (yp, uvp) = solid_p010_frame(64, 16, 600, 400, 700);
+        let src = P010Frame::new(&yp, &uvp, 64, 16, 64, 64);
+
+        let mut rgb_scalar = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_scalar = std::vec![0u16; 64 * 16 * 3];
+        let mut s_scalar = MixedSinker::<P010>::new(64, 16)
+            .with_simd(false)
+            .with_rgb(&mut rgb_scalar)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_scalar)
+            .unwrap();
+        p010_to(&src, false, ColorMatrix::Bt709, &mut s_scalar).unwrap();
+
+        let mut rgb_simd = std::vec![0u8; 64 * 16 * 3];
+        let mut rgb_u16_simd = std::vec![0u16; 64 * 16 * 3];
+        let mut s_simd = MixedSinker::<P010>::new(64, 16)
+            .with_rgb(&mut rgb_simd)
+            .unwrap()
+            .with_rgb_u16(&mut rgb_u16_simd)
+            .unwrap();
+        p010_to(&src, false, ColorMatrix::Bt709, &mut s_simd).unwrap();
+
+        assert_eq!(rgb_scalar, rgb_simd);
+        assert_eq!(rgb_u16_scalar, rgb_u16_simd);
+    }
+}
diff --git a/src/yuv/mod.rs b/src/yuv/mod.rs
index 655b706..eedc2ab 100644
--- a/src/yuv/mod.rs
+++ b/src/yuv/mod.rs
@@ -10,15 +10,20 @@
 //! chroma (Android MediaCodec default).
 //! - [`Yuv420p10`](crate::yuv::Yuv420p10) — 4:2:0 planar at 10 bits
 //! per sample (HDR10 / 10‑bit SDR software decode).
+//! - [`P010`](crate::yuv::P010) — 4:2:0 semi‑planar at 10 bits per
+//! sample, high‑bit‑packed (HDR hardware decode: VideoToolbox,
+//! VA‑API, NVDEC, D3D11VA, Intel QSV).
 //!
 //! Other families land in follow-up commits.
 
 mod nv12;
 mod nv21;
+mod p010;
 mod yuv420p;
 mod yuv420p10;
 
 pub use nv12::{Nv12, Nv12Row, Nv12Sink, nv12_to};
 pub use nv21::{Nv21, Nv21Row, Nv21Sink, nv21_to};
+pub use p010::{P010, P010Row, P010Sink, p010_to};
 pub use yuv420p::{Yuv420p, Yuv420pRow, Yuv420pSink, yuv420p_to};
 pub use yuv420p10::{Yuv420p10, Yuv420p10Row, Yuv420p10Sink, yuv420p10_to};
diff --git a/src/yuv/p010.rs b/src/yuv/p010.rs
new file mode 100644
index 0000000..744c6ce
--- /dev/null
+++ b/src/yuv/p010.rs
@@ -0,0 +1,150 @@
+//! P010 — semi‑planar 4:2:0, 10‑bit, high‑bit‑packed
+//! (`AV_PIX_FMT_P010LE`).
+//!
+//! Storage is a 2‑plane layout: one full‑size Y plane plus one
+//! interleaved UV plane at half width and half height. Sample width
+//! is `u16` with the 10 active bits in the **high** 10 positions of
+//! each element (`sample = value << 6`), low 6 bits zero. This is
+//! Microsoft's P010 convention and what every HDR hardware decoder
+//! emits — Apple VideoToolbox, VA‑API, NVDEC, D3D11VA, Intel QSV.
+//!
+//! Conversion semantics mirror [`super::Nv12`] on the layout side and
+//! [`super::Yuv420p10`] on the Q‑math side: two consecutive Y rows
+//! share one UV row (4:2:0), chroma is nearest‑neighbor upsampled in
+//! registers inside the row primitive, and every SIMD backend shifts
+//! each `u16` load right by 6 to extract the 10‑bit value before
+//! running the same Q15 pipeline used by [`super::Yuv420p10`].
+
+use crate::{ColorMatrix, PixelSink, SourceFormat, frame::P010Frame, sealed::Sealed};
+
+/// Zero‑sized marker for the P010 source format. Used as the `F` type
+/// parameter on [`crate::sinker::MixedSinker`].
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
+pub struct P010;
+
+impl Sealed for P010 {}
+impl SourceFormat for P010 {}
+
+/// One output row of a P010 source handed to a [`P010Sink`].
+///
+/// Accessors:
+/// - [`y`](Self::y) — full‑width Y row (`width` `u16` samples, high‑
+/// bit‑packed).
+/// - [`uv_half`](Self::uv_half) — **interleaved, half‑width** UV row
+/// (`width` `u16` elements = `width / 2` U/V pairs, U first). The
+/// row primitive deinterleaves and upsamples in‑register.
+/// - [`row`](Self::row) — output row index (`0 ..= frame.height() - 1`).
+/// - [`matrix`](Self::matrix), [`full_range`](Self::full_range) —
+/// carried through from the kernel call.
+#[derive(Debug, Clone, Copy)]
+pub struct P010Row<'a> {
+    y: &'a [u16],
+    uv_half: &'a [u16],
+    row: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+}
+
+impl<'a> P010Row<'a> {
+    /// Bundles one row of a P010 source for a [`P010Sink`].
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub(crate) fn new(
+        y: &'a [u16],
+        uv_half: &'a [u16],
+        row: usize,
+        matrix: ColorMatrix,
+        full_range: bool,
+    ) -> Self {
+        Self {
+            y,
+            uv_half,
+            row,
+            matrix,
+            full_range,
+        }
+    }
+
+    /// Full‑width Y (luma) row — `width` `u16` samples, high‑bit‑packed
+    /// (10 active bits in the high 10 of each element).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn y(&self) -> &'a [u16] {
+        self.y
+    }
+
+    /// Interleaved UV row — `width` `u16` elements laid out as
+    /// `U0, V0, U1, V1, …, U_{w/2-1}, V_{w/2-1}`. Each element is
+    /// high‑bit‑packed.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn uv_half(&self) -> &'a [u16] {
+        self.uv_half
+    }
+
+    /// Output row index within the frame.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn row(&self) -> usize {
+        self.row
+    }
+
+    /// YUV → RGB matrix carried through from the kernel call.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn matrix(&self) -> ColorMatrix {
+        self.matrix
+    }
+
+    /// `true` iff Y uses the full sample range (`[0, 1023]` for 10‑bit,
+    /// scaled into the high 10 bits of each `u16`); `false` for limited
+    /// range.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub const fn full_range(&self) -> bool {
+        self.full_range
+    }
+}
+
+/// Sinks that consume P010 rows.
+///
+/// A subtrait of [`PixelSink`] that pins the row shape to
+/// [`P010Row`]. Implementors get
+/// `process(&mut self, row: P010Row<'_>) -> Result<(), Self::Error>`
+/// via the supertrait.
+pub trait P010Sink: for<'a> PixelSink<Input<'a> = P010Row<'a>> {}
+
+/// Converts a P010 frame by walking its rows and feeding each one to
+/// the [`P010Sink`].
+///
+/// The kernel is a pure row walker — no color arithmetic happens
+/// here. Slice math picks the Y row and the correct UV row for each
+/// output row (`chroma_row = row / 2` for 4:2:0) and hands borrows to
+/// the Sink. The Sink decides what to derive and where to write.
+pub fn p010_to<S: P010Sink>(
+    src: &P010Frame<'_>,
+    full_range: bool,
+    matrix: ColorMatrix,
+    sink: &mut S,
+) -> Result<(), S::Error> {
+    sink.begin_frame(src.width(), src.height())?;
+
+    let w = src.width() as usize;
+    let h = src.height() as usize;
+    let y_stride = src.y_stride() as usize;
+    let uv_stride = src.uv_stride() as usize;
+    // UV row payload is `width` `u16` elements — `width / 2` interleaved
+    // U/V pairs.
+    let uv_row_elems = w;
+
+    let y_plane = src.y();
+    let uv_plane = src.uv();
+
+    for row in 0..h {
+        let y_start = row * y_stride;
+        let y = &y_plane[y_start..y_start + w];
+
+        // 4:2:0 chroma subsampling: two consecutive Y rows share one UV
+        // row.
+        let chroma_row = row / 2;
+        let uv_start = chroma_row * uv_stride;
+        let uv_half = &uv_plane[uv_start..uv_start + uv_row_elems];
+
+        sink.process(P010Row::new(y, uv_half, row, matrix, full_range))?;
+    }
+    Ok(())
+}