Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
361d881
feat: reorder row groups by statistics during sort pushdown
zhuqi-lucas Apr 13, 2026
fc9e0b4
test: add SLT tests for row group reorder by statistics
zhuqi-lucas Apr 13, 2026
57ebc3d
test: add EXPLAIN assertions for row group reorder tests
zhuqi-lucas Apr 13, 2026
66a9185
fix: use max statistics for DESC sort reorder
zhuqi-lucas Apr 13, 2026
213fb2c
fix: prevent reorder+reverse double-reordering of row groups
zhuqi-lucas Apr 14, 2026
6c56600
fix: rebase conflicts and compilation errors
zhuqi-lucas Apr 16, 2026
815481b
refactor: introduce AccessPlanOptimizer trait for row group reordering
zhuqi-lucas Apr 16, 2026
439a5d6
chore: remove benchmark from this PR (tracked in #21582)
zhuqi-lucas Apr 16, 2026
6ee488e
fix: resolve doc link for AccessPlanOptimizer
zhuqi-lucas Apr 17, 2026
832a541
fix: restore benchmark files from upstream main
zhuqi-lucas Apr 17, 2026
bdd86a0
fix: compose reorder and reverse as sequential steps instead of mutua…
zhuqi-lucas Apr 18, 2026
c3dac5e
fix: generate scrambled+overlapping RGs for overlap benchmark
zhuqi-lucas Apr 18, 2026
f340c65
feat: reorder files in shared work queue by statistics for TopK
zhuqi-lucas Apr 20, 2026
19a51fc
feat: initialize TopK dynamic filter threshold from parquet statistics
zhuqi-lucas Apr 18, 2026
e230d2f
feat: enable file reorder and RG reorder for all TopK queries
zhuqi-lucas Apr 20, 2026
cfdacf0
perf: move stats init before RG pruning so first file also benefits
zhuqi-lucas Apr 20, 2026
5199d9f
fix: restrict RG reorder/reverse to sort pushdown path only
zhuqi-lucas Apr 20, 2026
b2e93bc
perf: move stats init before PruningPredicate build + fix CastExpr un…
zhuqi-lucas Apr 21, 2026
45fb5d7
fix: null-aware filter + restrict stats init to sort pushdown path
zhuqi-lucas Apr 21, 2026
e490d8e
feat: enable stats init for ALL TopK queries + fix fuzz test tiebreaker
zhuqi-lucas Apr 21, 2026
5cc6a98
fix: restrict stats init to sort pushdown path to avoid over-pruning
zhuqi-lucas Apr 21, 2026
9bec94a
fix: stats init only safe for sorted (non-overlapping) RGs
zhuqi-lucas Apr 21, 2026
c0e6659
feat: enable stats init for pure TopK queries (no WHERE clause)
zhuqi-lucas Apr 21, 2026
0ca20d8
fix: stats init requires sort pushdown + no WHERE clause
zhuqi-lucas Apr 21, 2026
9ad381d
feat: TopK cumulative RG pruning after reorder (works with WHERE)
zhuqi-lucas Apr 21, 2026
40ec907
feat: enable RG reorder + cumulative prune for all TopK queries
zhuqi-lucas Apr 21, 2026
ef6123a
fix: only reverse/cumulate when reorder succeeds (prevents ClickBench…
zhuqi-lucas Apr 21, 2026
f76c5bc
fix: escape brackets in doc comment to fix rustdoc link error
zhuqi-lucas Apr 21, 2026
f91c5b9
chore: remove benchmark and listing_table_partitions changes from thi…
zhuqi-lucas Apr 21, 2026
743b846
refactor: remove stats init in favor of cumulative RG pruning + add S…
zhuqi-lucas Apr 21, 2026
f7c42d8
fix: cumulative prune only without WHERE to avoid under-returning rows
zhuqi-lucas Apr 21, 2026
3325610
feat: restore stats init with fixes (GtEq + df.fetch() + type cast)
zhuqi-lucas Apr 21, 2026
76f63aa
fix: SortExec.fetch was 0 when create_filter was called
zhuqi-lucas Apr 21, 2026
93a8668
perf: skip RG reorder when sort column not in file schema
zhuqi-lucas Apr 22, 2026
8ded1f4
fix: use slt:ignore for non-deterministic output_rows_skew metric
zhuqi-lucas Apr 22, 2026
138bd8e
fix: skip cumulative prune when row_selection exists + cleanup
zhuqi-lucas Apr 22, 2026
7acb19a
feat: support multi-key ORDER BY + truncate row_selection safety
zhuqi-lucas Apr 22, 2026
d725d84
test: add multi-key Inexact sort pushdown test (Test M)
zhuqi-lucas Apr 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 44 additions & 110 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@ clickbench_extended: ClickBench \"inspired\" queries against a single parquet
# Sort Pushdown Benchmarks
sort_pushdown: Sort pushdown baseline (no WITH ORDER) on TPC-H data (SF=1)
sort_pushdown_sorted: Sort pushdown with WITH ORDER — tests sort elimination on non-overlapping files
sort_pushdown_inexact: Sort pushdown Inexact path (--sorted DESC) — multi-file with scrambled RGs, tests reverse scan + RG reorder
sort_pushdown_inexact_unsorted: Sort pushdown Inexact path (no WITH ORDER) — same data, tests Unsupported path + RG reorder
sort_pushdown_inexact_overlap: Sort pushdown Inexact path — multi-file scrambled RGs (streaming data scenario)
sort_pushdown_inexact: Sort pushdown Inexact path (--sorted DESC) — tests reverse scan + RG reorder
sort_pushdown_inexact_unsorted: Sort pushdown Inexact path (no WITH ORDER) — tests Unsupported path + RG reorder
sort_pushdown_inexact_overlap: Sort pushdown Inexact path — partially overlapping RGs (streaming data scenario)

# Sorted Data Benchmarks (ORDER BY Optimization)
clickbench_sorted: ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization)
Expand Down Expand Up @@ -1154,150 +1154,84 @@ run_sort_pushdown_sorted() {

# Generates data for sort pushdown Inexact benchmark.
#
# Produces multiple parquet files where each file has MULTIPLE row groups
# with scrambled RG order. This tests both:
# - Row-group-level reorder within each file (reorder_by_statistics)
# - TopK threshold initialization from RG statistics
#
# Strategy:
# 1. Write a single sorted file with small (100K-row) RGs (~61 RGs total).
# 2. Use pyarrow to redistribute RGs into N_FILES files, scrambling the
# RG order within each file using a deterministic permutation.
# Each file gets ~61/N_FILES RGs with narrow, non-overlapping ranges
# but in scrambled order.
#
# Writing a single file with ORDER BY scramble does NOT work: the parquet
# writer merges rows from adjacent chunks at RG boundaries, widening
# ranges and defeating reorder_by_statistics.
#
# Requires pyarrow (pip install pyarrow).
# Produces a single large lineitem parquet file where row groups have
# NON-OVERLAPPING but OUT-OF-ORDER l_orderkey ranges (each RG internally
# sorted, RGs shuffled). This simulates append-heavy workloads where data
# is written in batches at different times.
data_sort_pushdown_inexact() {
INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact/lineitem"
if [ -d "${INEXACT_DIR}" ] && [ "$(ls -A ${INEXACT_DIR}/*.parquet 2>/dev/null)" ]; then
echo "Sort pushdown Inexact data already exists at ${INEXACT_DIR}"
return
fi

# Check pyarrow dependency (needed to split/scramble RGs)
if ! python3 -c "import pyarrow" 2>/dev/null; then
echo "Error: pyarrow is required for sort pushdown Inexact data generation."
echo "Install with: pip install pyarrow"
return 1
fi

echo "Generating sort pushdown Inexact benchmark data (multi-file, scrambled RGs)..."
echo "Generating sort pushdown Inexact benchmark data (single file, shuffled RGs)..."

# Re-use the sort_pushdown data as the source (generate if missing)
data_sort_pushdown

mkdir -p "${INEXACT_DIR}"
SRC_DIR="${DATA_DIR}/sort_pushdown/lineitem"

# Step 1: Write a single sorted file with small (100K-row) RGs
TMPFILE="${INEXACT_DIR}/_sorted_small_rgs.parquet"
# Use datafusion-cli to bucket rows into 64 groups by a deterministic
# scrambler, then sort within each bucket by orderkey. This produces
# ~64 RG-sized segments where each has a tight orderkey range but the
# segments appear in scrambled (non-sorted) order in the file.
(cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c "
CREATE EXTERNAL TABLE src
STORED AS PARQUET
LOCATION '${SRC_DIR}';

COPY (SELECT * FROM src ORDER BY l_orderkey)
TO '${TMPFILE}'
COPY (
SELECT * FROM src
ORDER BY
(l_orderkey * 1664525 + 1013904223) % 64,
l_orderkey
)
TO '${INEXACT_DIR}/shuffled.parquet'
STORED AS PARQUET
OPTIONS ('format.max_row_group_size' '100000');
")

# Step 2: Redistribute RGs into 3 files with scrambled RG order.
# Each file gets ~20 RGs. RG assignment: rg_idx % 3 determines file,
# permutation (rg_idx * 41 + 7) % n scrambles the order within file.
python3 -c "
import pyarrow.parquet as pq

pf = pq.ParquetFile('${TMPFILE}')
n = pf.metadata.num_row_groups
n_files = 3

# Assign each RG to a file, scramble order within each file
file_rgs = [[] for _ in range(n_files)]
for rg_idx in range(n):
slot = (rg_idx * 41 + 7) % n # scrambled index
file_id = slot % n_files
file_rgs[file_id].append(rg_idx)

# Write each file with its assigned RGs (in scrambled order)
for file_id in range(n_files):
rgs = file_rgs[file_id]
if not rgs:
continue
tables = [pf.read_row_group(rg) for rg in rgs]
writer = pq.ParquetWriter(
'${INEXACT_DIR}/part_%03d.parquet' % file_id,
pf.schema_arrow)
for t in tables:
writer.write_table(t)
writer.close()
print(f'File part_{file_id:03d}.parquet: {len(rgs)} RGs')
"

rm -f "${TMPFILE}"
echo "Sort pushdown Inexact data generated at ${INEXACT_DIR}"
echo "Sort pushdown Inexact shuffled data generated at ${INEXACT_DIR}"
ls -la "${INEXACT_DIR}"

# Also generate overlap data: same strategy but with different file count
# and permutation. Simulates streaming data with network delays where
# chunks arrive out of sequence.
#
# Requires pyarrow (pip install pyarrow).
# Also generate a file with partially overlapping row groups.
# Simulates streaming data with network delays: each chunk is mostly
# in order but has a small overlap with the next chunk (±5% of the
# chunk range). This is the pattern described by @adriangb — data
# arriving with timestamps that are generally increasing but with
# network-induced jitter causing small overlaps between row groups.
OVERLAP_DIR="${DATA_DIR}/sort_pushdown_inexact_overlap/lineitem"
if [ -d "${OVERLAP_DIR}" ] && [ "$(ls -A ${OVERLAP_DIR}/*.parquet 2>/dev/null)" ]; then
echo "Sort pushdown Inexact overlap data already exists at ${OVERLAP_DIR}"
return
fi

echo "Generating sort pushdown Inexact overlap data (multi-file, scrambled RGs)..."
echo "Generating sort pushdown Inexact overlap data (partially overlapping RGs)..."
mkdir -p "${OVERLAP_DIR}"

# Step 1: Write a single sorted file with small (100K-row) RGs
TMPFILE="${OVERLAP_DIR}/_sorted_small_rgs.parquet"
(cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c "
CREATE EXTERNAL TABLE src
STORED AS PARQUET
LOCATION '${SRC_DIR}';

COPY (SELECT * FROM src ORDER BY l_orderkey)
TO '${TMPFILE}'
-- Add jitter to l_orderkey: shift each row by a random-ish offset
-- proportional to its position. This creates overlap between adjacent
-- row groups while preserving the general ascending trend.
-- Formula: l_orderkey + (l_orderkey * 7 % 5000) - 2500
-- This adds ±2500 jitter, creating ~5K overlap between adjacent 100K-row RGs.
COPY (
SELECT * FROM src
ORDER BY l_orderkey + (l_orderkey * 7 % 5000) - 2500
)
TO '${OVERLAP_DIR}/overlapping.parquet'
STORED AS PARQUET
OPTIONS ('format.max_row_group_size' '100000');
")

# Step 2: Redistribute into 5 files with scrambled RG order.
python3 -c "
import pyarrow.parquet as pq

pf = pq.ParquetFile('${TMPFILE}')
n = pf.metadata.num_row_groups
n_files = 5

file_rgs = [[] for _ in range(n_files)]
for rg_idx in range(n):
slot = (rg_idx * 37 + 13) % n
file_id = slot % n_files
file_rgs[file_id].append(rg_idx)

for file_id in range(n_files):
rgs = file_rgs[file_id]
if not rgs:
continue
tables = [pf.read_row_group(rg) for rg in rgs]
writer = pq.ParquetWriter(
'${OVERLAP_DIR}/part_%03d.parquet' % file_id,
pf.schema_arrow)
for t in tables:
writer.write_table(t)
writer.close()
print(f'File part_{file_id:03d}.parquet: {len(rgs)} RGs')
"

rm -f "${TMPFILE}"
echo "Sort pushdown Inexact overlap data generated at ${OVERLAP_DIR}"
ls -la "${OVERLAP_DIR}"
}

# Runs the sort pushdown Inexact benchmark (tests RG reorder by statistics).
Expand All @@ -1306,7 +1240,7 @@ for file_id in range(n_files):
run_sort_pushdown_inexact() {
INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact.json"
echo "Running sort pushdown Inexact benchmark (multi-file scrambled RGs, --sorted DESC)..."
echo "Running sort pushdown Inexact benchmark (--sorted, DESC, reverse scan path)..."
DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${INEXACT_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
Expand All @@ -1322,13 +1256,13 @@ run_sort_pushdown_inexact_unsorted() {
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${INEXACT_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact_unsorted" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}

# Runs the sort pushdown benchmark with multi-file scrambled RG order.
# Simulates streaming data with network delays — multiple files, each with
# scrambled RGs. Tests both RG-level reorder and TopK stats initialization.
# Runs the sort pushdown benchmark with partially overlapping RGs.
# Simulates streaming data with network jitter — RGs are mostly in order
# but have small overlaps (±2500 orderkey jitter between adjacent RGs).
run_sort_pushdown_inexact_overlap() {
OVERLAP_DIR="${DATA_DIR}/sort_pushdown_inexact_overlap"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact_overlap.json"
echo "Running sort pushdown Inexact benchmark (multi-file scrambled RGs, streaming data pattern)..."
echo "Running sort pushdown Inexact benchmark (overlapping RGs, streaming data pattern)..."
DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${OVERLAP_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact_overlap" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
Expand Down
25 changes: 20 additions & 5 deletions datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -305,21 +305,36 @@ async fn test_fuzz_topk_filter_pushdown() {
}

let mut queries = vec![];
let all_columns = ["id", "name", "department"];

for limit in [1, 10] {
for num_order_by_columns in [1, 2, 3] {
for order_columns in ["id", "name", "department"]
.iter()
.combinations(num_order_by_columns)
{
for order_columns in all_columns.iter().combinations(num_order_by_columns) {
for orderings in order_columns
.iter()
.map(|col| orders.get(**col).unwrap())
.multi_cartesian_product()
{
// Add remaining columns as ASC tiebreakers to make
// the ordering fully deterministic. Without this,
// optimizations that change RG read order (e.g.
// statistics-based pruning) may produce different
// but equally valid tie-breaking results.
let used: Vec<&str> = order_columns.iter().map(|c| **c).collect();
let tiebreakers: Vec<String> = all_columns
.iter()
.filter(|c| !used.contains(*c))
.map(|c| format!("{c} ASC NULLS LAST"))
.collect();
let mut all_orderings: Vec<&str> =
orderings.iter().map(|s| s.as_str()).collect();
let tiebreaker_refs: Vec<&str> =
tiebreakers.iter().map(|s| s.as_str()).collect();
all_orderings.extend(tiebreaker_refs);

let query = format!(
"SELECT * FROM test_table ORDER BY {} LIMIT {}",
orderings.into_iter().join(", "),
all_orderings.join(", "),
limit
);
queries.push(query);
Expand Down
Loading
Loading