diff --git a/datasketches/src/hll/list.rs b/datasketches/src/hll/list.rs index 1f0b3e7..689e42c 100644 --- a/datasketches/src/hll/list.rs +++ b/datasketches/src/hll/list.rs @@ -82,13 +82,17 @@ impl List { empty: bool, compact: bool, ) -> Result { - // Compute array size - let array_size = if compact { coupon_count } else { 1 << lg_arr }; - - // Read coupons + // Always allocate the full-sized array (1 << lg_arr) so Coupon::EMPTY sentinel + // slots are available for future update() calls. In compact format only + // coupon_count values are stored on disk, but memory must hold the full capacity + // so the linear scan in update() can find an empty slot to insert into. + let array_size = 1 << lg_arr; + let read_count = if compact { coupon_count } else { array_size }; + + // Read coupons into the front of the full-sized array; remaining slots stay Coupon::EMPTY. let mut coupons = vec![Coupon::EMPTY; array_size]; if !empty && coupon_count > 0 { - for (i, coupon) in coupons.iter_mut().enumerate() { + for (i, coupon) in coupons.iter_mut().take(read_count).enumerate() { let raw = cursor.read_u32_le().map_err(|_| { Error::insufficient_data(format!( "expect {coupon_count} coupons, failed at index {i}" diff --git a/datasketches/tests/hll_serialization_test.rs b/datasketches/tests/hll_serialization_test.rs index a7e00e6..4f7291b 100644 --- a/datasketches/tests/hll_serialization_test.rs +++ b/datasketches/tests/hll_serialization_test.rs @@ -22,6 +22,7 @@ use std::path::PathBuf; use common::serialization_test_data; use datasketches::hll::HllSketch; +use datasketches::hll::HllType; fn test_sketch_file(path: PathBuf, expected_cardinality: usize, expected_lg_k: u8) { let expected = expected_cardinality as f64; @@ -102,6 +103,33 @@ fn test_sketch_file(path: PathBuf, expected_cardinality: usize, expected_lg_k: u ); } +/// Reproducer for https://github.com/apache/datasketches-rust/issues/115 +/// +/// A compact-serialized List has no trailing COUPON_EMPTY (0) sentinels. +/// Before the fix, update() would scan the fully-packed array, find no +/// empty slot, and silently drop the new value. +#[test] +fn test_update_after_deserialize_list_mode() { + const LG_K: u8 = 11; + for hll_type in [HllType::Hll4, HllType::Hll6, HllType::Hll8] { + let mut sketch = HllSketch::new(LG_K, hll_type); + sketch.update(1u64); + + // Round-trip through serialization (compact format, List mode) + let bytes = sketch.serialize(); + let mut sketch = HllSketch::deserialize(&bytes).unwrap(); + + // This update was silently dropped before the fix + sketch.update(2u64); + + let est = sketch.estimate(); + assert!( + (est - 2.0).abs() < 0.1, + "{hll_type:?}: expected estimate close to 2.0 after update post-deserialize, got {est}" + ); + } +} + #[test] fn test_java_hll4_compatibility() { let test_cases = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];