From dc02681af8bf55dbb7149bafffd551decfd4e340 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Mon, 13 Apr 2026 16:31:12 +0200 Subject: [PATCH] fix(hll): update silently dropped after deserializing a compact List sketch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an `HllSketch` in **List mode** is serialized it uses compact format: only the live coupons are written to disk, with no trailing `COUPON_EMPTY` sentinels. On deserialization, the container was allocated with exactly `coupon_count` slots, leaving the array fully packed. `list::update()` scans linearly for a `COUPON_EMPTY` (`0`) sentinel to find an insertion slot. Finding none, it fell through silently — discarding every value added after the round-trip. Always allocate `1 << lg_arr` slots (initialized to `COUPON_EMPTY`) in `List::deserialize()`, reading only `coupon_count` elements from the byte stream in compact mode. The trailing empty slots are then available as sentinels for subsequent `update()` calls. Fixes #115. --- datasketches/src/hll/list.rs | 14 ++++++---- datasketches/tests/hll_serialization_test.rs | 28 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/datasketches/src/hll/list.rs b/datasketches/src/hll/list.rs index 1f0b3e7..689e42c 100644 --- a/datasketches/src/hll/list.rs +++ b/datasketches/src/hll/list.rs @@ -82,13 +82,17 @@ impl List { empty: bool, compact: bool, ) -> Result { - // Compute array size - let array_size = if compact { coupon_count } else { 1 << lg_arr }; - - // Read coupons + // Always allocate the full-sized array (1 << lg_arr) so Coupon::EMPTY sentinel + // slots are available for future update() calls. In compact format only + // coupon_count values are stored on disk, but memory must hold the full capacity + // so the linear scan in update() can find an empty slot to insert into. + let array_size = 1 << lg_arr; + let read_count = if compact { coupon_count } else { array_size }; + + // Read coupons into the front of the full-sized array; remaining slots stay Coupon::EMPTY. let mut coupons = vec![Coupon::EMPTY; array_size]; if !empty && coupon_count > 0 { - for (i, coupon) in coupons.iter_mut().enumerate() { + for (i, coupon) in coupons.iter_mut().take(read_count).enumerate() { let raw = cursor.read_u32_le().map_err(|_| { Error::insufficient_data(format!( "expect {coupon_count} coupons, failed at index {i}" diff --git a/datasketches/tests/hll_serialization_test.rs b/datasketches/tests/hll_serialization_test.rs index a7e00e6..4f7291b 100644 --- a/datasketches/tests/hll_serialization_test.rs +++ b/datasketches/tests/hll_serialization_test.rs @@ -22,6 +22,7 @@ use std::path::PathBuf; use common::serialization_test_data; use datasketches::hll::HllSketch; +use datasketches::hll::HllType; fn test_sketch_file(path: PathBuf, expected_cardinality: usize, expected_lg_k: u8) { let expected = expected_cardinality as f64; @@ -102,6 +103,33 @@ fn test_sketch_file(path: PathBuf, expected_cardinality: usize, expected_lg_k: u ); } +/// Reproducer for https://github.com/apache/datasketches-rust/issues/115 +/// +/// A compact-serialized List has no trailing COUPON_EMPTY (0) sentinels. +/// Before the fix, update() would scan the fully-packed array, find no +/// empty slot, and silently drop the new value. +#[test] +fn test_update_after_deserialize_list_mode() { + const LG_K: u8 = 11; + for hll_type in [HllType::Hll4, HllType::Hll6, HllType::Hll8] { + let mut sketch = HllSketch::new(LG_K, hll_type); + sketch.update(1u64); + + // Round-trip through serialization (compact format, List mode) + let bytes = sketch.serialize(); + let mut sketch = HllSketch::deserialize(&bytes).unwrap(); + + // This update was silently dropped before the fix + sketch.update(2u64); + + let est = sketch.estimate(); + assert!( + (est - 2.0).abs() < 0.1, + "{hll_type:?}: expected estimate close to 2.0 after update post-deserialize, got {est}" + ); + } +} + #[test] fn test_java_hll4_compatibility() { let test_cases = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];