diff --git a/Cargo.toml b/Cargo.toml index 6f6464b..3189008 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,11 @@ path = "benches/srt.rs" name = "srt" harness = false +[[bench]] +path = "benches/vtt.rs" +name = "vtt" +harness = false + [features] default = ["std"] alloc = ["memchr"] diff --git a/README.md b/README.md index 45e9e6a..8177d26 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,41 @@ fasrt = "0.2" | `alloc` | No | Enables `CueText` DOM tree and entity decoding without `std` | | `memchr` | Yes (via `alloc`/`std`) | SIMD-accelerated fast path for entity decoding | +## Benchmarks + +Measured on Apple Silicon with `cargo bench` (Criterion). + +### SRT + +| Benchmark | Input | Time | Throughput | +|-----------|-------|------|------------| +| Parse (strict) | 2 cues, 89 B | ~170 ns | 520 MiB/s | +| Parse (strict) | 26 KB file | ~38 µs | 661 MiB/s | +| Parse (lossy) | 332 files, ~8 MB | ~12.1 ms | 646 MiB/s | +| Collect into `Vec` | 26 KB file | ~40 µs | 616 MiB/s | + +### WebVTT + +| Benchmark | Input | Time | Throughput | +|-----------|-------|------|------------| +| Parse | 2 cues, 96 B | ~318 ns | 291 MiB/s | +| Parse | Settings + region + style, 354 B | ~915 ns | 387 MiB/s | +| Parse | All WPT fixtures, ~34 KB | ~113 µs | 314 MiB/s | +| Collect into `Vec` | Settings + region + style, 354 B | ~973 ns | 364 MiB/s | + +### Cue Text + +| Benchmark | Input | Time | Throughput | +|-----------|-------|------|------------| +| Parse | Tags only, 166 B | ~316 ns | 552 MiB/s | +| Parse | 500 timestamps, ~11 KB | ~14.1 µs | 776 MiB/s | + +Run benchmarks yourself: + +```sh +cargo bench +``` + #### License `fasrt` is under the terms of both the MIT license and the diff --git a/benches/srt.rs b/benches/srt.rs index f328e4d..55bb31e 100644 --- a/benches/srt.rs +++ b/benches/srt.rs @@ -1 +1,103 @@ -fn main() {} +use std::hint::black_box; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use fasrt::srt::Parser; + +const SMALL_SRT: &str = "\ +1 +00:00:01,000 --> 00:00:04,000 +Hello world! + +2 +00:00:05,000 --> 00:00:08,000 +Goodbye world! +"; + +const MEDIUM_SRT: &str = include_str!("../fixtures/srt/DeathNote_01.eng.srt"); + +fn load_all_fixtures() -> String { + if let Ok(read_dir) = std::fs::read_dir("fixtures/srt") { + let mut paths: Vec<_> = read_dir + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.extension().is_some_and(|e| e == "srt") { + Some(path) + } else { + None + } + }) + .collect(); + + paths.sort(); + + let mut buf = String::new(); + for path in paths { + if let Ok(content) = std::fs::read_to_string(path) { + if !buf.is_empty() { + buf.push_str("\n\n"); + } + buf.push_str(&content); + } + } + buf + } else { + // Fallback: use a small embedded sample when fixtures are unavailable. + SMALL_SRT.to_string() + } +} + +fn bench_srt_parse(c: &mut Criterion) { + let all_fixtures = load_all_fixtures(); + + let mut group = c.benchmark_group("srt/parse"); + + // Small inline SRT + group.throughput(Throughput::Bytes(SMALL_SRT.len() as u64)); + group.bench_function(BenchmarkId::new("strict", "small_2_cues"), |b| { + b.iter(|| { + let count = Parser::strict(black_box(SMALL_SRT)).count(); + black_box(count); + }); + }); + + // Medium real file (~26 KB) + group.throughput(Throughput::Bytes(MEDIUM_SRT.len() as u64)); + group.bench_function(BenchmarkId::new("strict", "medium_26kb"), |b| { + b.iter(|| { + let count = Parser::strict(black_box(MEDIUM_SRT)).count(); + black_box(count); + }); + }); + + // All fixtures (~8 MB) + group.throughput(Throughput::Bytes(all_fixtures.len() as u64)); + group.bench_function(BenchmarkId::new("lossy", "all_fixtures_8mb"), |b| { + b.iter(|| { + let count = Parser::lossy(black_box(&all_fixtures)).count(); + black_box(count); + }); + }); + + group.finish(); +} + +fn bench_srt_collect(c: &mut Criterion) { + let mut group = c.benchmark_group("srt/collect"); + + // Collect into Vec to measure allocation overhead + group.throughput(Throughput::Bytes(MEDIUM_SRT.len() as u64)); + group.bench_function("medium_26kb", |b| { + b.iter(|| { + let entries: Vec<_> = Parser::strict(black_box(MEDIUM_SRT)) + .collect::>() + .unwrap(); + black_box(entries.len()); + }); + }); + + group.finish(); +} + +criterion_group!(benches, bench_srt_parse, bench_srt_collect); +criterion_main!(benches); diff --git a/benches/vtt.rs b/benches/vtt.rs new file mode 100644 index 0000000..6b6d2d8 --- /dev/null +++ b/benches/vtt.rs @@ -0,0 +1,176 @@ +use std::hint::black_box; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use fasrt::vtt::Parser; +use fasrt::vtt::cue::CueParser; + +const SMALL_VTT: &str = "\ +WEBVTT + +00:00:00.000 --> 00:00:01.000 +Hello world! + +00:00:01.000 --> 00:00:02.000 +Goodbye world! +"; + +const SETTINGS_VTT: &str = "\ +WEBVTT + +STYLE +::cue { color: white; } + +REGION +id:region1 +width:40% +lines:3 +regionanchor:0%,100% +viewportanchor:10%,90% +scroll:up + +cue-1 +00:00:00.000 --> 00:00:01.000 align:start position:10% size:80% line:0 vertical:rl region:region1 +Bold and italic text + +NOTE This is a comment + +00:00:01.000 --> 00:00:05.000 +Second cue with voice tag +"; + +fn load_all_vtt_fixtures() -> String { + let mut buf = String::new(); + for dir in &[ + "fixtures/webvtt/wpt-file-parsing", + "fixtures/webvtt/wpt-cue-parsing", + ] { + if let Ok(read_dir) = std::fs::read_dir(dir) { + let mut entries: Vec<_> = read_dir.filter_map(Result::ok).collect(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); + for entry in entries { + if entry.path().extension().is_some_and(|e| e == "vtt") { + if let Ok(contents) = std::fs::read_to_string(entry.path()) { + buf.push_str(&contents); + buf.push_str("\n\n"); + } + } + } + } + } + if buf.is_empty() { + // Fallback to embedded samples so benches still run without fixtures. + buf.push_str(SMALL_VTT); + buf.push_str("\n\n"); + buf.push_str(SETTINGS_VTT); + } + buf +} + +fn bench_vtt_parse(c: &mut Criterion) { + let all_fixtures = load_all_vtt_fixtures(); + + let mut group = c.benchmark_group("vtt/parse"); + + // Small inline VTT + group.throughput(Throughput::Bytes(SMALL_VTT.len() as u64)); + group.bench_function(BenchmarkId::new("parse", "small_2_cues"), |b| { + b.iter(|| { + let count = Parser::new(black_box(SMALL_VTT)).count(); + black_box(count); + }); + }); + + // VTT with settings, regions, styles, cue options + group.throughput(Throughput::Bytes(SETTINGS_VTT.len() as u64)); + group.bench_function(BenchmarkId::new("parse", "with_settings"), |b| { + b.iter(|| { + let count = Parser::new(black_box(SETTINGS_VTT)).count(); + black_box(count); + }); + }); + + // All VTT fixtures + group.throughput(Throughput::Bytes(all_fixtures.len() as u64)); + group.bench_function(BenchmarkId::new("parse", "all_fixtures"), |b| { + b.iter(|| { + let count = Parser::new(black_box(&all_fixtures)).count(); + black_box(count); + }); + }); + + group.finish(); +} + +fn bench_vtt_collect(c: &mut Criterion) { + let mut group = c.benchmark_group("vtt/collect"); + + group.throughput(Throughput::Bytes(SETTINGS_VTT.len() as u64)); + group.bench_function("with_settings", |b| { + b.iter(|| { + let blocks: Vec<_> = Parser::new(black_box(SETTINGS_VTT)) + .collect::>() + .unwrap(); + black_box(blocks.len()); + }); + }); + + group.finish(); +} + +/// Cue text with many timestamp tags to benchmark the cue-text parsing path. +fn build_cue_text_with_timestamps() -> String { + let mut s = String::new(); + for i in 0..500 { + let h = i / 3600; + let m = (i % 3600) / 60; + let sec = i % 60; + s.push_str(&format!( + "word <{h:02}:{m:02}:{sec:02}.{ms:03}>text", + h = h, + m = m, + sec = sec, + ms = (i * 7) % 1000 + )); + } + s +} + +/// Cue text with tags but no timestamps. +const CUE_TEXT_TAGS: &str = "\ +bold bold-italic plain underline \ +voice english baseruby \ +classed & < >   end"; + +fn bench_vtt_cue_text(c: &mut Criterion) { + let ts_input = build_cue_text_with_timestamps(); + + let mut group = c.benchmark_group("vtt/cue_text"); + + // Tags only (no timestamps) + group.throughput(Throughput::Bytes(CUE_TEXT_TAGS.len() as u64)); + group.bench_function("tags_only", |b| { + b.iter(|| { + let count = CueParser::new(black_box(CUE_TEXT_TAGS)).count(); + black_box(count); + }); + }); + + // Timestamp-heavy cue text + group.throughput(Throughput::Bytes(ts_input.len() as u64)); + group.bench_function("500_timestamps", |b| { + b.iter(|| { + let count = CueParser::new(black_box(&ts_input)).count(); + black_box(count); + }); + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_vtt_parse, + bench_vtt_collect, + bench_vtt_cue_text +); +criterion_main!(benches); diff --git a/ci/miri_sb.sh b/ci/miri_sb.sh index cc3c6e0..de9aa95 100755 --- a/ci/miri_sb.sh +++ b/ci/miri_sb.sh @@ -35,4 +35,6 @@ cargo miri setup export MIRIFLAGS="-Zmiri-strict-provenance -Zmiri-disable-isolation -Zmiri-symbolic-alignment-check" -cargo miri test --all-targets --target "$TARGET" +cargo miri test --lib --tests --target "$TARGET" + +cargo miri test --doc --target "$TARGET" diff --git a/ci/miri_tb.sh b/ci/miri_tb.sh index 5d374c7..a9c8beb 100755 --- a/ci/miri_tb.sh +++ b/ci/miri_tb.sh @@ -35,4 +35,6 @@ cargo miri setup export MIRIFLAGS="-Zmiri-strict-provenance -Zmiri-disable-isolation -Zmiri-symbolic-alignment-check -Zmiri-tree-borrows" -cargo miri test --all-targets --target "$TARGET" +cargo miri test --lib --tests --target "$TARGET" + +cargo miri test --doc --target "$TARGET" diff --git a/src/error.rs b/src/error.rs index 9dee2bb..ad6775a 100644 --- a/src/error.rs +++ b/src/error.rs @@ -38,18 +38,31 @@ pub enum ParseSecondError { } /// The error type for parsing hour components of timestamps. +/// +/// This enum is shared by both SRT and WebVTT parsers: +/// - **SRT** hours are 2–3 digits (0–999): uses [`NotPadded`](Self::NotPadded) +/// and [`Overflow(u16)`](Self::Overflow). +/// - **WebVTT** hours are unbounded (`u64`): uses [`NotPadded`](Self::NotPadded) +/// for non-digit input and [`HourOverflow`](Self::HourOverflow) when the +/// value exceeds `u64::MAX`. #[derive(Debug, Clone, PartialEq, Eq, IsVariant, Unwrap, TryUnwrap, thiserror::Error)] #[unwrap(ref, ref_mut)] #[try_unwrap(ref, ref_mut)] pub enum ParseHourError { - /// The hour component is not zero-padded to 2 digits. - #[error("hour component is not zero-padded to 2 digits")] + /// The hour component is not zero-padded to 2 digits (SRT), + /// or contains non-digit characters (VTT). + #[error("hour component is not zero-padded to 2 digits or contains invalid characters")] #[unwrap(ignore)] #[try_unwrap(ignore)] NotPadded, - /// The hour component is out of range (not between 0-999). + /// The hour component is out of the SRT range (0–999). #[error("hour component must be between 0-999, but was {0}")] Overflow(u16), + /// The hour component overflowed `u64` (VTT unbounded hours). + #[error("hour component overflowed")] + #[unwrap(ignore)] + #[try_unwrap(ignore)] + HourOverflow, /// Not a valid number. #[error(transparent)] ParseInt(#[from] ParseIntError), @@ -73,6 +86,26 @@ pub enum ParseMillisecondError { ParseInt(#[from] ParseIntError), } +/// Specific reason why a VTT timestamp has invalid structure. +/// +/// This covers structural validation errors only (length, separators, digits). +/// Component range errors (hours, minutes, seconds, milliseconds) are +/// represented by their dedicated error types ([`ParseHourError`], +/// [`ParseMinuteError`], [`ParseSecondError`], [`ParseMillisecondError`]) +/// which are separate variants of [`crate::vtt::ParseVttError`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, thiserror::Error)] +pub enum TimestampError { + /// The input is too short or has an invalid length for a VTT timestamp. + #[error("invalid length")] + InvalidLength, + /// A separator (`.` or `:`) is not in the expected position. + #[error("invalid format")] + InvalidFormat, + /// One or more digit positions contain non-digit bytes. + #[error("invalid digits")] + InvalidDigits, +} + /// The error type for parsing index numbers of subtitles. #[derive(Debug, Clone, PartialEq, Eq, IsVariant, Unwrap, TryUnwrap, thiserror::Error)] #[unwrap(ref, ref_mut)] diff --git a/src/srt.rs b/src/srt.rs index 29291a2..7f01ddf 100644 --- a/src/srt.rs +++ b/src/srt.rs @@ -3,6 +3,7 @@ use core::num::NonZeroU64; use logos::{Lexer, Logos}; use crate::error::*; +use crate::types::{Millisecond, Minute, Second}; pub use types::{Entry, Header, Hour, Timestamp}; @@ -293,7 +294,7 @@ impl Default for Options { enum Token { /// Header "HH:MM:SS,mmm --> HH:MM:SS,mmm" #[regex( - r"[0-9]{1,3}:[0-5][0-9]:[0-5][0-9],[0-9]{3}[ \t\x0C]+-->[ \t\x0C]+[0-9]{1,3}:[0-5][0-9]:[0-5][0-9],[0-9]{3}", + r"[0-9]{2,3}:[0-5][0-9]:[0-5][0-9],[0-9]{3}[ \t\x0C]+-->[ \t\x0C]+[0-9]{2,3}:[0-5][0-9]:[0-5][0-9],[0-9]{3}", parse_header )] Header(Header), @@ -322,35 +323,45 @@ fn parse_number(s: &mut Lexer<'_, Token>) -> Result { #[inline] fn parse_header(s: &mut Lexer<'_, Token>) -> Result { - let s = s.slice().trim(); - let mut parts = s.split(" --> "); - match (parts.next(), parts.next()) { - (Some(start), Some(end)) => { - let start = parse_timestamp(start)?; - let end = parse_timestamp(end)?; - Ok(Header::new(start, end)) - } - _ => panic!("logos regex should guarantee this never happens"), - } + let slice = s.slice(); + // The regex guarantees `HH+:MM:SS,mmm --> HH+:MM:SS,mmm`. + // Find the arrow separator and extract both timestamps via byte arithmetic. + let arrow = slice.find("-->").unwrap(); + let start_str = slice[..arrow].trim(); + let end_str = slice[arrow + 3..].trim(); + let start = parse_timestamp_bytes(start_str.as_bytes())?; + let end = parse_timestamp_bytes(end_str.as_bytes())?; + Ok(Header::new(start, end)) } +/// Parse an SRT timestamp from raw bytes using direct digit extraction. +/// +/// Format: `H+:MM:SS,mmm` where H is 2-3 digits (regex-validated), +/// MM and SS are exactly 2 digits, mmm is exactly 3 digits. +/// Layout from the end: `mmm`(3) `,`(1) `SS`(2) `:`(1) `MM`(2) `:`(1) = 10 fixed bytes. #[inline] -fn parse_timestamp(s: &str) -> Result { - let mut parts = s.split(","); - - match (parts.next(), parts.next()) { - (Some(hms), Some(millis)) => { - let mut hms_parts = hms.split(':'); - - let (h, m, s) = match (hms_parts.next(), hms_parts.next(), hms_parts.next()) { - (Some(h), Some(m), Some(s)) => (h.parse()?, m.parse()?, s.parse()?), - _ => panic!("logos regex should guarantee this never happens"), - }; - let millis = millis.parse()?; - Ok(Timestamp::from_hmsm(h, m, s, millis)) - } - _ => panic!("logos regex should guarantee this never happens"), - } +fn parse_timestamp_bytes(b: &[u8]) -> Result { + let len = b.len(); + let millis = Millisecond(digit3(&b[len - 3..])); + let seconds = Second(digit2(&b[len - 6..len - 4])); + let minutes = Minute(digit2(&b[len - 9..len - 7])); + let hour_len = len - 10; + let hours = match hour_len { + 2 => Hour(digit2(&b[..2]) as u16), + 3 => Hour(digit3(&b[..3])), + _ => return Err(ParseHourError::NotPadded.into()), + }; + Ok(Timestamp::from_hmsm(hours, minutes, seconds, millis)) +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn digit2(b: &[u8]) -> u8 { + (b[0] - b'0') * 10 + (b[1] - b'0') +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn digit3(b: &[u8]) -> u16 { + (b[0] - b'0') as u16 * 100 + (b[1] - b'0') as u16 * 10 + (b[2] - b'0') as u16 } struct StateBody { @@ -736,18 +747,23 @@ impl<'a> Iterator for Lines<'a> { return None; } - let remaining = &self.input[self.pos..]; - let line_end = remaining - .find('\n') + let bytes = &self.input.as_bytes()[self.pos..]; + + #[cfg(all(feature = "memchr", not(miri)))] + let found = memchr::memchr(b'\n', bytes); + #[cfg(not(all(feature = "memchr", not(miri))))] + let found = bytes.iter().position(|&b| b == b'\n'); + + let line_end = found .map(|i| { - let end = if i > 0 && remaining.as_bytes()[i - 1] == b'\r' { + let end = if i > 0 && bytes[i - 1] == b'\r' { i - 1 } else { i }; (end, i + 1) }) - .unwrap_or((remaining.len(), remaining.len())); + .unwrap_or((bytes.len(), bytes.len())); let line = &self.input[self.pos..self.pos + line_end.0]; self.pos += line_end.1; diff --git a/src/vtt.rs b/src/vtt.rs index 46104dd..52e3cae 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -1,6 +1,9 @@ use logos::{Lexer, Logos}; -use crate::{error::*, types::Millisecond}; +use crate::{ + error::*, + types::{Millisecond, Minute, Second}, +}; pub use types::{ Align, Anchor, Block, Cue, CueId, CueOptions, Header, Hour, Line, LineAlign, LineValue, @@ -42,7 +45,7 @@ pub enum ParseVttError { /// A timestamp could not be parsed. #[error("invalid timestamp: {0}")] - InvalidTimestamp(&'static str), + InvalidTimestamp(TimestampError), /// The timing line is malformed (missing `-->`). #[error("invalid timing line: missing '-->' separator")] @@ -104,36 +107,156 @@ fn split_arrow(s: &str) -> Result<(&str, &str), ParseVttError> { } /// Parse a timestamp in either long form `HH:MM:SS.mmm` or short form -/// `MM:SS.mmm`. Auto-detects by counting `:` separators. +/// `MM:SS.mmm` using direct byte extraction. +/// +/// The logos regex guarantees the format, so we can extract fields by +/// fixed offsets from the end: `mmm`(3) `.`(1) `SS`(2) `:`(1) `MM`(2) +/// = 9 fixed bytes. If there's a `:`(1) before that, everything before +/// it is the hours component. +/// +/// # Safety assumption +/// +/// Callers **must** ensure that `s` has already been validated by a +/// logos regex (which guarantees digit/separator placement). +/// For loosely-validated input (e.g. unterminated cue-text tags), use +/// [`parse_timestamp_cue`] instead. #[inline] pub(crate) fn parse_timestamp(s: &str) -> Result { - let (time, ms) = s - .split_once('.') - .ok_or(ParseVttError::InvalidTimestamp("missing '.'"))?; - let millis: Millisecond = ms.parse()?; - let mut parts = time.splitn(3, ':'); - let first = parts - .next() - .ok_or(ParseVttError::InvalidTimestamp("empty timestamp"))?; - let second = parts.next().ok_or(ParseVttError::InvalidTimestamp( - "missing minutes or seconds", + let b = s.as_bytes(); + let len = b.len(); + if len < 9 { + return Err(ParseVttError::InvalidTimestamp( + TimestampError::InvalidLength, + )); + } + let millis = Millisecond(vtt_digit3(&b[len - 3..])); + let seconds = Second(vtt_digit2(&b[len - 6..len - 4])); + let minutes = Minute(vtt_digit2(&b[len - 9..len - 7])); + let hours = if len > 9 { + let hour_str = &s[..len - 10]; + parse_vtt_hour_bytes(hour_str.as_bytes())? + } else { + Hour::new() + }; + Ok(Timestamp::from_hmsm(hours, minutes, seconds, millis)) +} + +/// Parse a timestamp from cue-text, where input is only loosely validated +/// by the regex `<[0-9:]+\.[0-9]{3}>`. +/// +/// Unlike [`parse_timestamp`], this validates separators and digit bytes +/// before doing any arithmetic, so it is safe for untrusted input. +#[inline] +pub(crate) fn parse_timestamp_cue(s: &str) -> Result { + let b = s.as_bytes(); + let len = b.len(); + // Short form `MM:SS.mmm` = 9 bytes (0 hour digits). + // Long form `H+:MM:SS.mmm` = 12..=29 bytes (1..=20 hour digits). + // Reject lengths outside [9, 29] and the gap [10, 11]. + if !(9..=29).contains(&len) || (len > 9 && len < 12) { + return Err(ParseVttError::InvalidTimestamp( + TimestampError::InvalidLength, + )); + } + // Validate separators. + if b[len - 4] != b'.' || b[len - 7] != b':' { + return Err(ParseVttError::InvalidTimestamp( + TimestampError::InvalidFormat, + )); + } + // Validate digit bytes before arithmetic. + let millis_val = vtt_digit3_checked(&b[len - 3..]).ok_or(ParseVttError::InvalidTimestamp( + TimestampError::InvalidDigits, ))?; - match parts.next() { - // Long form: HH:MM:SS - Some(sec_str) => Ok(Timestamp::from_hmsm( - first.parse()?, - second.parse()?, - sec_str.parse()?, - millis, - )), - // Short form: MM:SS - None => Ok(Timestamp::from_hmsm( - Hour::new(), - first.parse()?, - second.parse()?, - millis, - )), + let seconds_val = vtt_digit2_checked(&b[len - 6..len - 4]).ok_or( + ParseVttError::InvalidTimestamp(TimestampError::InvalidDigits), + )?; + let minutes_val = vtt_digit2_checked(&b[len - 9..len - 7]).ok_or( + ParseVttError::InvalidTimestamp(TimestampError::InvalidDigits), + )?; + let millis = Millisecond::try_with(millis_val).ok_or(ParseVttError::ParseMillisecond( + ParseMillisecondError::Overflow(millis_val), + ))?; + let seconds = Second::try_with(seconds_val).ok_or(ParseVttError::ParseSecond( + ParseSecondError::Overflow(seconds_val), + ))?; + let minutes = Minute::try_with(minutes_val).ok_or(ParseVttError::ParseMinute( + ParseMinuteError::Overflow(minutes_val), + ))?; + let hours = if len >= 12 { + if b[len - 10] != b':' { + return Err(ParseVttError::InvalidTimestamp( + TimestampError::InvalidFormat, + )); + } + let hour_bytes = &b[..len - 10]; + parse_vtt_hour_bytes(hour_bytes)? + } else { + Hour::new() + }; + Ok(Timestamp::from_hmsm(hours, minutes, seconds, millis)) +} + +/// Parse VTT hour bytes (variable-length, unbounded u64). +/// +/// Uses checked arithmetic to prevent overflow on extremely large +/// hour values from untrusted input. +#[cfg_attr(not(tarpaulin), inline(always))] +fn parse_vtt_hour_bytes(b: &[u8]) -> Result { + let mut val: u64 = 0; + for &byte in b { + if !byte.is_ascii_digit() { + return Err(ParseHourError::NotPadded); + } + val = val + .checked_mul(10) + .and_then(|v| v.checked_add((byte - b'0') as u64)) + .ok_or(ParseHourError::HourOverflow)?; } + Ok(Hour(val)) +} + +/// Unchecked 2-digit extraction. Caller must guarantee ASCII digits. +/// +/// Only called from [`parse_timestamp`], which requires logos-regex-validated input. +/// Debug assertions catch misuse during development. +#[cfg_attr(not(tarpaulin), inline(always))] +fn vtt_digit2(b: &[u8]) -> u8 { + debug_assert!(b[0].is_ascii_digit() && b[1].is_ascii_digit()); + (b[0] - b'0') * 10 + (b[1] - b'0') +} + +/// Unchecked 3-digit extraction. Caller must guarantee ASCII digits. +/// +/// Only called from [`parse_timestamp`], which requires logos-regex-validated input. +/// Debug assertions catch misuse during development. +#[cfg_attr(not(tarpaulin), inline(always))] +fn vtt_digit3(b: &[u8]) -> u16 { + debug_assert!(b[0].is_ascii_digit() && b[1].is_ascii_digit() && b[2].is_ascii_digit()); + (b[0] - b'0') as u16 * 100 + (b[1] - b'0') as u16 * 10 + (b[2] - b'0') as u16 +} + +/// Checked 2-digit extraction. Returns `None` if any byte is not an ASCII digit. +#[cfg_attr(not(tarpaulin), inline(always))] +fn vtt_digit2_checked(b: &[u8]) -> Option { + let d0 = b[0].wrapping_sub(b'0'); + let d1 = b[1].wrapping_sub(b'0'); + if d0 > 9 || d1 > 9 { + return None; + } + Some(d0 * 10 + d1) +} + +/// Checked 3-digit extraction. Returns `None` if any byte is not an ASCII digit. +#[cfg_attr(not(tarpaulin), inline(always))] +fn vtt_digit3_checked(b: &[u8]) -> Option { + let d0 = b[0].wrapping_sub(b'0'); + let d1 = b[1].wrapping_sub(b'0'); + let d2 = b[2].wrapping_sub(b'0'); + if d0 > 9 || d1 > 9 || d2 > 9 { + return None; + } + Some(d0 as u16 * 100 + d1 as u16 * 10 + d2 as u16) } /// Lex the next token from a line. Returns `Ok(None)` when the line @@ -1253,28 +1376,27 @@ impl<'a> Iterator for Lines<'a> { } let bytes = &self.input.as_bytes()[self.pos..]; - let mut i = 0; - while i < bytes.len() { - if bytes[i] == b'\n' { - let line = &self.input[self.pos..self.pos + i]; - self.pos += i + 1; - return Some(line); - } else if bytes[i] == b'\r' { + + #[cfg(all(feature = "memchr", not(miri)))] + let found = memchr::memchr2(b'\n', b'\r', bytes); + #[cfg(any(not(feature = "memchr"), miri))] + let found = bytes.iter().position(|&b| b == b'\n' || b == b'\r'); + + match found { + Some(i) => { let line = &self.input[self.pos..self.pos + i]; - // Check for CRLF - if i + 1 < bytes.len() && bytes[i + 1] == b'\n' { + if bytes[i] == b'\r' && i + 1 < bytes.len() && bytes[i + 1] == b'\n' { self.pos += i + 2; } else { self.pos += i + 1; } - return Some(line); + Some(line) + } + None => { + let line = &self.input[self.pos..]; + self.pos = self.input.len(); + Some(line) } - i += 1; } - - // No line terminator found - rest of input - let line = &self.input[self.pos..]; - self.pos = self.input.len(); - Some(line) } } diff --git a/src/vtt/cue.rs b/src/vtt/cue.rs index 64483d9..a810b86 100644 --- a/src/vtt/cue.rs +++ b/src/vtt/cue.rs @@ -95,8 +95,9 @@ enum RawCueToken<'a> { StartLang(&'a str), // ── timestamp tag ───────────────────────────────────────────────────── - /// `<[digits/colons].[3 digits]>` — validated by `parse_timestamp` later. - #[regex(r"<[0-9:]+\.[0-9]{3}>")] + /// `` or `` — fully validated by the DFA so + /// the fast unchecked `parse_timestamp` can be used directly. + #[regex(r"<(?:[0-9]+:)?[0-5][0-9]:[0-5][0-9]\.[0-9]{3}>")] Timestamp(&'a str), // ── fallbacks (skipped by the iterator) ─────────────────────────────── @@ -574,6 +575,7 @@ impl<'a> Iterator for CueParser<'a> { // ── timestamp ── Ok(RawCueToken::Timestamp(s)) => { let content = &s[1..s.len() - 1]; // strip `<` and `>` + // Regex already validates format; use unchecked fast path. if let Ok(ts) = super::parse_timestamp(content) { return Some(CueToken::Timestamp(ts)); } @@ -608,7 +610,7 @@ fn try_parse_unterminated<'a>(slice: &'a str) -> Option> { // Try timestamp: digits/colons + "." + 3 digits if inner.as_bytes()[0].is_ascii_digit() { - if let Ok(ts) = super::parse_timestamp(inner) { + if let Ok(ts) = super::parse_timestamp_cue(inner) { return Some(CueToken::Timestamp(ts)); } return None; diff --git a/tests/vtt_cue_text.rs b/tests/vtt_cue_text.rs index 446d53e..5a61641 100644 --- a/tests/vtt_cue_text.rs +++ b/tests/vtt_cue_text.rs @@ -374,3 +374,93 @@ impl CueTokenExt for CueToken<'_> { } } } + +// ── Malformed timestamp rejection tests ────────────────────────────────────── +// +// These verify that malformed cue-text timestamp tags are safely rejected +// (treated as unknown tags or skipped) without panicking, even in debug builds. + +/// Colons where digits are expected: `<:::.000>` matches the old loose regex +/// but is rejected by the tightened DFA. +#[test] +fn cue_text_rejects_colons_as_digits() { + let tokens: Vec<_> = CueParser::new("<:::.000>").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "colons-only should not parse as timestamp" + ); +} + +/// Minutes out of range: `<99:99.000>` — rejected by the DFA (`[0-5][0-9]`). +#[test] +fn cue_text_rejects_out_of_range_minutes() { + let tokens: Vec<_> = CueParser::new("<99:99.000>").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "99:99 should not parse as timestamp" + ); +} + +/// Seconds out of range: `<00:60.000>` — rejected by the DFA. +#[test] +fn cue_text_rejects_out_of_range_seconds() { + let tokens: Vec<_> = CueParser::new("<00:60.000>").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "60 seconds should not parse as timestamp" + ); +} + +/// Non-digit bytes in hour position: ``. +#[test] +fn cue_text_rejects_non_digit_hours() { + let tokens: Vec<_> = CueParser::new("").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "non-digit hours should not parse as timestamp" + ); +} + +/// Empty hour prefix: `<:00:00.000>` — colon where a digit is expected. +#[test] +fn cue_text_rejects_empty_hour_prefix() { + let tokens: Vec<_> = CueParser::new("<:00:00.000>").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "empty hour prefix should not parse as timestamp" + ); +} + +/// Very large hours that would overflow u64: 30-digit hour value. +#[test] +fn cue_text_rejects_overflowing_hours() { + // 30 digits exceeds u64::MAX (20 digits) + let tag = format!("<{}:00:00.000>", "9".repeat(30)); + let tokens: Vec<_> = CueParser::new(&tag).collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "overflowing hours should not parse as timestamp" + ); +} + +/// Unterminated timestamp tag with valid format goes through `parse_timestamp_cue`. +#[test] +fn cue_text_unterminated_valid_timestamp() { + // `<00:05.000` without closing `>` — handled by try_parse_unterminated + let tokens: Vec<_> = CueParser::new("<00:05.000").collect(); + assert!( + tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "unterminated but valid timestamp should parse" + ); +} + +/// Unterminated timestamp with invalid digits goes through `parse_timestamp_cue` +/// and is safely rejected. +#[test] +fn cue_text_unterminated_invalid_timestamp() { + let tokens: Vec<_> = CueParser::new("<99:99.000").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "unterminated invalid timestamp should be rejected" + ); +}