From 411d6f77ece53ecd8c751009a5a46b1d78db792a Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Tue, 21 Apr 2026 15:10:20 -0400 Subject: [PATCH] implement smarter sampler Signed-off-by: Connor Tsui --- vortex-btrblocks/src/schemes/integer.rs | 32 +- vortex-compressor/public-api.lock | 28 +- .../src/builtins/constant/float.rs | 2 +- .../src/builtins/constant/string.rs | 2 +- vortex-compressor/src/compressor.rs | 303 ++++++++++++++++-- vortex-compressor/src/ctx.rs | 1 + vortex-compressor/src/estimate.rs | 36 ++- 7 files changed, 350 insertions(+), 54 deletions(-) diff --git a/vortex-btrblocks/src/schemes/integer.rs b/vortex-btrblocks/src/schemes/integer.rs index 7e6c3503239..7dc1fccad80 100644 --- a/vortex-btrblocks/src/schemes/integer.rs +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -17,6 +17,7 @@ use vortex_compressor::builtins::FloatDictScheme; use vortex_compressor::builtins::StringDictScheme; use vortex_compressor::estimate::CompressionEstimate; use vortex_compressor::estimate::DeferredEstimate; +use vortex_compressor::estimate::EstimateScore; use vortex_compressor::estimate::EstimateVerdict; use vortex_compressor::scheme::AncestorExclusion; use vortex_compressor::scheme::ChildSelection; @@ -737,20 +738,29 @@ impl Scheme for SequenceScheme { return CompressionEstimate::Verdict(EstimateVerdict::Skip); } - // TODO(connor): Why do we sequence encode the whole thing and then throw it away? And then - // why do we divide the ratio by 2??? - + // TODO(connor): `sequence_encode` allocates the encoded array just to confirm feasibility. + // A cheaper `is_sequence` probe would let us skip the allocation entirely. CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( - |_compressor, data, _ctx, exec_ctx| { - let Some(encoded) = sequence_encode(data.array_as_primitive(), exec_ctx)? else { - // If we are unable to sequence encode this array, make sure we skip. + |_compressor, data, best_so_far, _ctx, exec_ctx| { + // `SequenceArray` stores exactly two scalars (base and multiplier), so the best + // achievable compression ratio is `array_len / 2`. + let compressed_size = 2usize; + let max_ratio = data.array_len() as f64 / compressed_size as f64; + + // If we cannot beat the best so far, then we do not want to even try sequence + // encoding the data. + let threshold = best_so_far.and_then(EstimateScore::finite_ratio); + if threshold.is_some_and(|t| max_ratio <= t) { return Ok(EstimateVerdict::Skip); - }; + } - // TODO(connor): This doesn't really make sense? - // Since two values are required to store base and multiplier the compression ratio is - // divided by 2. - Ok(EstimateVerdict::Ratio(encoded.len() as f64 / 2.0)) + // TODO(connor): We should pass this array back to the compressor in the case that + // we do want to sequence encode this so that we do not need to recompress. + if sequence_encode(data.array_as_primitive(), exec_ctx)?.is_none() { + return Ok(EstimateVerdict::Skip); + } + // TODO(connor): Should we get the actual ratio here? + Ok(EstimateVerdict::Ratio(max_ratio)) }, ))) } diff --git a/vortex-compressor/public-api.lock b/vortex-compressor/public-api.lock index 05cbad5ad6f..d93f2c44305 100644 --- a/vortex-compressor/public-api.lock +++ b/vortex-compressor/public-api.lock @@ -326,6 +326,32 @@ impl core::fmt::Debug for vortex_compressor::estimate::DeferredEstimate pub fn vortex_compressor::estimate::DeferredEstimate::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub enum vortex_compressor::estimate::EstimateScore + +pub vortex_compressor::estimate::EstimateScore::FiniteCompression(f64) + +pub vortex_compressor::estimate::EstimateScore::ZeroBytes + +impl vortex_compressor::estimate::EstimateScore + +pub fn vortex_compressor::estimate::EstimateScore::finite_ratio(self) -> core::option::Option + +impl core::clone::Clone for vortex_compressor::estimate::EstimateScore + +pub fn vortex_compressor::estimate::EstimateScore::clone(&self) -> vortex_compressor::estimate::EstimateScore + +impl core::cmp::PartialEq for vortex_compressor::estimate::EstimateScore + +pub fn vortex_compressor::estimate::EstimateScore::eq(&self, other: &vortex_compressor::estimate::EstimateScore) -> bool + +impl core::fmt::Debug for vortex_compressor::estimate::EstimateScore + +pub fn vortex_compressor::estimate::EstimateScore::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::estimate::EstimateScore + +impl core::marker::StructuralPartialEq for vortex_compressor::estimate::EstimateScore + pub enum vortex_compressor::estimate::EstimateVerdict pub vortex_compressor::estimate::EstimateVerdict::AlwaysUse @@ -338,7 +364,7 @@ impl core::fmt::Debug for vortex_compressor::estimate::EstimateVerdict pub fn vortex_compressor::estimate::EstimateVerdict::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub type vortex_compressor::estimate::EstimateFn = (dyn core::ops::function::FnOnce(&vortex_compressor::CascadingCompressor, &vortex_compressor::stats::ArrayAndStats, vortex_compressor::ctx::CompressorContext, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + core::marker::Send + core::marker::Sync) +pub type vortex_compressor::estimate::EstimateFn = (dyn core::ops::function::FnOnce(&vortex_compressor::CascadingCompressor, &vortex_compressor::stats::ArrayAndStats, core::option::Option, vortex_compressor::ctx::CompressorContext, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + core::marker::Send + core::marker::Sync) pub mod vortex_compressor::scheme diff --git a/vortex-compressor/src/builtins/constant/float.rs b/vortex-compressor/src/builtins/constant/float.rs index 1445a6cca2d..501e3cd9623 100644 --- a/vortex-compressor/src/builtins/constant/float.rs +++ b/vortex-compressor/src/builtins/constant/float.rs @@ -66,7 +66,7 @@ impl Scheme for FloatConstantScheme { // This is an expensive check, but in practice the distinct count is known because we often // include dictionary encoding in our set of schemes, so we rarely call this. CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( - |_compressor, data, _ctx, exec_ctx| { + |_compressor, data, _best_so_far, _ctx, exec_ctx| { if is_constant(data.array(), exec_ctx)? { Ok(EstimateVerdict::AlwaysUse) } else { diff --git a/vortex-compressor/src/builtins/constant/string.rs b/vortex-compressor/src/builtins/constant/string.rs index 68ccfe14e8a..fcd6138bca2 100644 --- a/vortex-compressor/src/builtins/constant/string.rs +++ b/vortex-compressor/src/builtins/constant/string.rs @@ -60,7 +60,7 @@ impl Scheme for StringConstantScheme { // This is an expensive check, but the alternative of not compressing a constant array is // far less preferable. CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( - |_compressor, data, _ctx, exec_ctx| { + |_compressor, data, _best_so_far, _ctx, exec_ctx| { if is_constant(data.array(), exec_ctx)? { Ok(EstimateVerdict::AlwaysUse) } else { diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs index 0f67619e3cb..aa18b61637b 100644 --- a/vortex-compressor/src/compressor.rs +++ b/vortex-compressor/src/compressor.rs @@ -349,7 +349,17 @@ impl CascadingCompressor { /// Calls [`expected_compression_ratio`] on each candidate and returns the winning scheme along /// with its resolved winner estimate, or `None` if no scheme beats the canonical encoding. - /// Ties are broken by registration order (earlier in the list wins). + /// + /// Selection runs in two passes. Pass 1 evaluates every immediate + /// [`CompressionEstimate::Verdict`] and tracks the running best. [`Scheme`]s returning + /// [`CompressionEstimate::Deferred`] are stashed for pass 2 so that we do not make any + /// expensive computations if we don't have to. + /// + /// Pass 2 evaluates the deferred work and, for each [`DeferredEstimate::Callback`], passes the + /// current best [`EstimateScore`] as an early-exit hint so the callback can return + /// [`EstimateVerdict::Skip`] without doing expensive work when it cannot beat the threshold. + /// + /// Ties are broken by registration order within each pass. /// /// [`expected_compression_ratio`]: Scheme::expected_compression_ratio fn choose_best_scheme( @@ -360,40 +370,61 @@ impl CascadingCompressor { exec_ctx: &mut ExecutionCtx, ) -> VortexResult> { let mut best: Option<(&'static dyn Scheme, EstimateScore)> = None; + let mut deferred: Vec<(&'static dyn Scheme, DeferredEstimate)> = Vec::new(); - // TODO(connor): Rather than computing the deferred estimates eagerly, it would be better to - // look at all quick estimates and see if it makes sense to sample at all. + // Pass 1: evaluate every immediate verdict. Stash deferred work for pass 2. for &scheme in schemes { - let verdict = - match scheme.expected_compression_ratio(data, compress_ctx.clone(), exec_ctx) { - CompressionEstimate::Verdict(verdict) => verdict, - CompressionEstimate::Deferred(DeferredEstimate::Sample) => { - let score = estimate_compression_ratio_with_sampling( - scheme, - self, - data.array(), - compress_ctx.clone(), - exec_ctx, - )?; - if is_better_score(score, &best) { - best = Some((scheme, score)); - } - continue; - } - CompressionEstimate::Deferred(DeferredEstimate::Callback(callback)) => { - callback(self, data, compress_ctx.clone(), exec_ctx)? + match scheme.expected_compression_ratio(data, compress_ctx.clone(), exec_ctx) { + CompressionEstimate::Verdict(EstimateVerdict::Skip) => {} + CompressionEstimate::Verdict(EstimateVerdict::AlwaysUse) => { + return Ok(Some((scheme, WinnerEstimate::AlwaysUse))); + } + CompressionEstimate::Verdict(EstimateVerdict::Ratio(ratio)) => { + let score = EstimateScore::FiniteCompression(ratio); + + if is_better_score(score, &best) { + best = Some((scheme, score)); } - }; + } + CompressionEstimate::Deferred(deferred_estimate) => { + deferred.push((scheme, deferred_estimate)); + } + } + } + + // Pass 2: run deferred work. Callbacks receive the current best as a threshold so they can + // short-circuit with `Skip` when they cannot beat it. + for (scheme, deferred_estimate) in deferred { + let threshold: Option = best.map(|(_, score)| score); + match deferred_estimate { + DeferredEstimate::Sample => { + let score = estimate_compression_ratio_with_sampling( + self, + scheme, + data.array(), + compress_ctx.clone(), + exec_ctx, + )?; - match verdict { - EstimateVerdict::Skip => {} - EstimateVerdict::AlwaysUse => return Ok(Some((scheme, WinnerEstimate::AlwaysUse))), - EstimateVerdict::Ratio(ratio) => { - let score = EstimateScore::FiniteCompression(ratio); if is_better_score(score, &best) { best = Some((scheme, score)); } } + DeferredEstimate::Callback(callback) => { + match callback(self, data, threshold, compress_ctx.clone(), exec_ctx)? { + EstimateVerdict::Skip => {} + EstimateVerdict::AlwaysUse => { + return Ok(Some((scheme, WinnerEstimate::AlwaysUse))); + } + EstimateVerdict::Ratio(ratio) => { + let score = EstimateScore::FiniteCompression(ratio); + + if is_better_score(score, &best) { + best = Some((scheme, score)); + } + } + } + } } } @@ -757,7 +788,7 @@ mod tests { _exec_ctx: &mut ExecutionCtx, ) -> CompressionEstimate { CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( - |_compressor, _data, _ctx, _exec_ctx| Ok(EstimateVerdict::AlwaysUse), + |_compressor, _data, _ctx, _exec_ctx, _best_so_far| Ok(EstimateVerdict::AlwaysUse), ))) } @@ -791,7 +822,7 @@ mod tests { _exec_ctx: &mut ExecutionCtx, ) -> CompressionEstimate { CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( - |_compressor, _data, _ctx, _exec_ctx| Ok(EstimateVerdict::Skip), + |_compressor, _data, _ctx, _exec_ctx, _best_so_far| Ok(EstimateVerdict::Skip), ))) } @@ -825,7 +856,7 @@ mod tests { _exec_ctx: &mut ExecutionCtx, ) -> CompressionEstimate { CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( - |_compressor, _data, _ctx, _exec_ctx| Ok(EstimateVerdict::Ratio(3.0)), + |_compressor, _data, _ctx, _exec_ctx, _best_so_far| Ok(EstimateVerdict::Ratio(3.0)), ))) } @@ -1201,6 +1232,216 @@ mod tests { Ok(()) } + // Observer helper used by threshold-related tests. Captures the `best_so_far` value the + // compressor passes to its deferred callback. `OBSERVER_LOCK` serializes tests that share + // `OBSERVED_THRESHOLD` so they do not race. + static OBSERVER_LOCK: Mutex<()> = Mutex::new(()); + static OBSERVED_THRESHOLD: Mutex>> = Mutex::new(None); + + #[derive(Debug)] + struct ThresholdObservingScheme; + + impl Scheme for ThresholdObservingScheme { + fn scheme_name(&self) -> &'static str { + "test.threshold_observing" + } + + fn matches(&self, canonical: &Canonical) -> bool { + matches_integer_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( + |_compressor, _data, best_so_far, _ctx, _exec_ctx| { + *OBSERVED_THRESHOLD.lock() = Some(best_so_far); + Ok(EstimateVerdict::Skip) + }, + ))) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + unreachable!("test helper should never be selected for compression") + } + } + + #[derive(Debug)] + struct CallbackMatchingRatioScheme; + + impl Scheme for CallbackMatchingRatioScheme { + fn scheme_name(&self) -> &'static str { + "test.callback_matching_ratio" + } + + fn matches(&self, canonical: &Canonical) -> bool { + matches_integer_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( + |_compressor, _data, _ctx, _exec_ctx, _best_so_far| Ok(EstimateVerdict::Ratio(2.0)), + ))) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + unreachable!("test helper should never be selected for compression") + } + } + + #[test] + fn callback_always_use_overrides_pass_one_best() -> VortexResult<()> { + // `HugeRatioScheme` returns an immediate `Ratio(100.0)` in pass 1; + // `CallbackAlwaysUseScheme` returns `AlwaysUse` from its deferred callback in pass 2. + // The deferred `AlwaysUse` must still win. + let compressor = CascadingCompressor::new(vec![&HugeRatioScheme, &CallbackAlwaysUseScheme]); + let schemes: [&'static dyn Scheme; 2] = [&HugeRatioScheme, &CallbackAlwaysUseScheme]; + let data = estimate_test_data(); + let mut exec_ctx = SESSION.create_execution_ctx(); + + let winner = compressor.choose_best_scheme( + &schemes, + &data, + CompressorContext::new(), + &mut exec_ctx, + )?; + + assert!(matches!( + winner, + Some((scheme, WinnerEstimate::AlwaysUse)) + if scheme.id() == CallbackAlwaysUseScheme.id() + )); + Ok(()) + } + + #[test] + fn threshold_reflects_pass_one_best() -> VortexResult<()> { + let _guard = OBSERVER_LOCK.lock(); + *OBSERVED_THRESHOLD.lock() = None; + + let compressor = + CascadingCompressor::new(vec![&DirectRatioScheme, &ThresholdObservingScheme]); + let schemes: [&'static dyn Scheme; 2] = [&DirectRatioScheme, &ThresholdObservingScheme]; + let data = estimate_test_data(); + let mut exec_ctx = SESSION.create_execution_ctx(); + + compressor.choose_best_scheme(&schemes, &data, CompressorContext::new(), &mut exec_ctx)?; + + let observed = *OBSERVED_THRESHOLD.lock(); + assert!(matches!( + observed, + Some(Some(EstimateScore::FiniteCompression(r))) if r == 2.0 + )); + Ok(()) + } + + #[test] + fn threshold_is_none_when_only_prior_is_zero_bytes() -> VortexResult<()> { + let _guard = OBSERVER_LOCK.lock(); + *OBSERVED_THRESHOLD.lock() = None; + + let compressor = + CascadingCompressor::new(vec![&ZeroBytesSamplingScheme, &ThresholdObservingScheme]); + let schemes: [&'static dyn Scheme; 2] = + [&ZeroBytesSamplingScheme, &ThresholdObservingScheme]; + let data = estimate_test_data(); + let mut exec_ctx = SESSION.create_execution_ctx(); + + compressor.choose_best_scheme(&schemes, &data, CompressorContext::new(), &mut exec_ctx)?; + + // The observing callback was invoked (outer `Some`) and `best_so_far` was `None` (inner + // `None`) because the zero-byte sample is never stored as the best. + let observed = *OBSERVED_THRESHOLD.lock(); + assert_eq!(observed, Some(None)); + Ok(()) + } + + #[test] + fn threshold_is_none_when_no_prior_scheme() -> VortexResult<()> { + let _guard = OBSERVER_LOCK.lock(); + *OBSERVED_THRESHOLD.lock() = None; + + let compressor = CascadingCompressor::new(vec![&ThresholdObservingScheme]); + let schemes: [&'static dyn Scheme; 1] = [&ThresholdObservingScheme]; + let data = estimate_test_data(); + let mut exec_ctx = SESSION.create_execution_ctx(); + + compressor.choose_best_scheme(&schemes, &data, CompressorContext::new(), &mut exec_ctx)?; + + let observed = *OBSERVED_THRESHOLD.lock(); + assert_eq!(observed, Some(None)); + Ok(()) + } + + #[test] + fn threshold_updates_from_earlier_deferred_callback() -> VortexResult<()> { + let _guard = OBSERVER_LOCK.lock(); + *OBSERVED_THRESHOLD.lock() = None; + + // Both schemes are deferred. The first callback registers `Ratio(3.0)`; the second + // callback must observe it as its threshold. + let compressor = + CascadingCompressor::new(vec![&CallbackRatioScheme, &ThresholdObservingScheme]); + let schemes: [&'static dyn Scheme; 2] = [&CallbackRatioScheme, &ThresholdObservingScheme]; + let data = estimate_test_data(); + let mut exec_ctx = SESSION.create_execution_ctx(); + + compressor.choose_best_scheme(&schemes, &data, CompressorContext::new(), &mut exec_ctx)?; + + let observed = *OBSERVED_THRESHOLD.lock(); + assert!(matches!( + observed, + Some(Some(EstimateScore::FiniteCompression(r))) if r == 3.0 + )); + Ok(()) + } + + #[test] + fn ratio_tie_between_immediate_and_deferred_favors_immediate() -> VortexResult<()> { + // Both schemes produce the same `Ratio(2.0)`, one from pass 1 (immediate) and one from + // pass 2 (deferred callback). Pass 1 locks in first, and strict `>` tie-breaking means + // the deferred callback's equal ratio cannot displace it. + let compressor = + CascadingCompressor::new(vec![&CallbackMatchingRatioScheme, &DirectRatioScheme]); + let schemes: [&'static dyn Scheme; 2] = [&CallbackMatchingRatioScheme, &DirectRatioScheme]; + let data = estimate_test_data(); + let mut exec_ctx = SESSION.create_execution_ctx(); + + let winner = compressor.choose_best_scheme( + &schemes, + &data, + CompressorContext::new(), + &mut exec_ctx, + )?; + + assert!(matches!( + winner, + Some((scheme, WinnerEstimate::Score(EstimateScore::FiniteCompression(r)))) + if scheme.id() == DirectRatioScheme.id() && r == 2.0 + )); + Ok(()) + } + #[test] fn all_null_array_compresses_to_constant() -> VortexResult<()> { let array = PrimitiveArray::new( @@ -1244,8 +1485,8 @@ mod tests { // "this must be present since `DictScheme` declared that we need distinct values" let mut exec_ctx = SESSION.create_execution_ctx(); let score = estimate_compression_ratio_with_sampling( - &FloatDictScheme, &compressor, + &FloatDictScheme, &array, ctx, &mut exec_ctx, diff --git a/vortex-compressor/src/ctx.rs b/vortex-compressor/src/ctx.rs index 576fc8002a6..4e0619ff8ee 100644 --- a/vortex-compressor/src/ctx.rs +++ b/vortex-compressor/src/ctx.rs @@ -30,6 +30,7 @@ pub struct CompressorContext { /// Merged stats options from all eligible schemes at this compression site. merged_stats_options: GenerateStatsOptions, + // TODO(connor): Replace this with an `im::Vector` /// The cascade chain: `(scheme_id, child_index)` pairs from root to current depth. /// Used for self-exclusion, push rules ([`descendant_exclusions`]), and pull rules /// ([`ancestor_exclusions`]). diff --git a/vortex-compressor/src/estimate.rs b/vortex-compressor/src/estimate.rs index 22a99fbccdc..9fbf434352a 100644 --- a/vortex-compressor/src/estimate.rs +++ b/vortex-compressor/src/estimate.rs @@ -23,12 +23,21 @@ use crate::trace; /// Closure type for [`DeferredEstimate::Callback`]. /// -/// The compressor calls this with the same arguments it would pass to sampling. The closure must -/// resolve directly to a terminal [`EstimateVerdict`]. +/// The compressor calls this with the same arguments it would pass to sampling, plus the best +/// [`EstimateScore`] observed so far (if any). The closure must resolve directly to a terminal +/// [`EstimateVerdict`]. +/// +/// The `best_so_far` threshold is an early-exit hint. If your scheme's maximum achievable +/// compression ratio is not strictly greater than +/// `best_so_far.and_then(EstimateScore::finite_ratio)`, you should return +/// [`EstimateVerdict::Skip`]. Returning a ratio equal to the threshold is permitted but will +/// lose to the prior best due to strict `>` tie-breaking in the selector. Use the threshold +/// only as an early-exit hint, never to perform additional work. #[rustfmt::skip] pub type EstimateFn = dyn FnOnce( &CascadingCompressor, &ArrayAndStats, + Option, CompressorContext, &mut ExecutionCtx, ) -> VortexResult @@ -85,17 +94,23 @@ pub enum DeferredEstimate { /// Use this only when the scheme needs to perform trial encoding or other costly checks to /// determine its compression ratio. The callback returns an [`EstimateVerdict`] directly, so /// it cannot request more sampling or another deferred callback. + /// + /// The compressor evaluates all immediate [`CompressionEstimate::Verdict`] results before + /// invoking any deferred callback, and passes the best [`EstimateScore`] observed so far to + /// the callback. This lets the callback return [`EstimateVerdict::Skip`] without performing + /// expensive work when its maximum achievable ratio cannot beat the current best. See + /// [`EstimateFn`] for the full contract. Callback(Box), } /// Ranked estimate used for comparing non-terminal compression candidates. #[derive(Debug, Clone, Copy, PartialEq)] -pub(super) enum EstimateScore { +pub enum EstimateScore { /// A finite compression ratio. Higher means a smaller amount of data, so it is better. FiniteCompression(f64), /// Trial compression produced a 0-byte output. /// - /// This has no finite trace ratio and is not eligible for scheme selection. + /// This has no finite ratio and is not eligible for scheme selection. /// /// TODO(connor): A zero-byte sample usually means the sampler happened to hit an all-null /// sample. Improve this logic so we can distinguish real zero-byte wins from sampling artifacts. @@ -112,8 +127,11 @@ impl EstimateScore { } } - /// Returns the traceable numeric ratio, omitting the zero-byte special case. - pub(super) fn trace_ratio(self) -> Option { + /// Returns the finite compression ratio, or [`None`] for the zero-byte special case. + /// + /// Callers comparing a scheme's maximum achievable ratio against a "best so far" threshold + /// should use this to extract a numeric value from an [`EstimateScore`]. + pub fn finite_ratio(self) -> Option { match self { Self::FiniteCompression(ratio) => Some(ratio), Self::ZeroBytes => None, @@ -156,7 +174,7 @@ impl WinnerEstimate { pub(super) fn trace_ratio(self) -> Option { match self { Self::AlwaysUse => None, - Self::Score(score) => score.trace_ratio(), + Self::Score(score) => score.finite_ratio(), } } } @@ -178,8 +196,8 @@ pub(super) fn is_better_score( /// /// Returns an error if sample compression fails. pub(super) fn estimate_compression_ratio_with_sampling( - scheme: &S, compressor: &CascadingCompressor, + scheme: &S, array: &ArrayRef, compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, @@ -212,7 +230,7 @@ pub(super) fn estimate_compression_ratio_with_sampling( // Single DEBUG event per sampled scheme. Downstream tooling can join this with the eventual // `scheme.compress_result` on the same scheme to compute sample-vs-full divergence. - trace::sample_result(scheme.id(), before, after, score.trace_ratio()); + trace::sample_result(scheme.id(), before, after, score.finite_ratio()); Ok(score) }