Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
361d881
feat: reorder row groups by statistics during sort pushdown
zhuqi-lucas Apr 13, 2026
fc9e0b4
test: add SLT tests for row group reorder by statistics
zhuqi-lucas Apr 13, 2026
57ebc3d
test: add EXPLAIN assertions for row group reorder tests
zhuqi-lucas Apr 13, 2026
66a9185
fix: use max statistics for DESC sort reorder
zhuqi-lucas Apr 13, 2026
213fb2c
fix: prevent reorder+reverse double-reordering of row groups
zhuqi-lucas Apr 14, 2026
6c56600
fix: rebase conflicts and compilation errors
zhuqi-lucas Apr 16, 2026
815481b
refactor: introduce AccessPlanOptimizer trait for row group reordering
zhuqi-lucas Apr 16, 2026
439a5d6
chore: remove benchmark from this PR (tracked in #21582)
zhuqi-lucas Apr 16, 2026
6ee488e
fix: resolve doc link for AccessPlanOptimizer
zhuqi-lucas Apr 17, 2026
832a541
fix: restore benchmark files from upstream main
zhuqi-lucas Apr 17, 2026
bdd86a0
fix: compose reorder and reverse as sequential steps instead of mutua…
zhuqi-lucas Apr 18, 2026
c3dac5e
fix: generate scrambled+overlapping RGs for overlap benchmark
zhuqi-lucas Apr 18, 2026
f340c65
feat: reorder files in shared work queue by statistics for TopK
zhuqi-lucas Apr 20, 2026
19a51fc
feat: initialize TopK dynamic filter threshold from parquet statistics
zhuqi-lucas Apr 18, 2026
e230d2f
feat: enable file reorder and RG reorder for all TopK queries
zhuqi-lucas Apr 20, 2026
cfdacf0
perf: move stats init before RG pruning so first file also benefits
zhuqi-lucas Apr 20, 2026
5199d9f
fix: restrict RG reorder/reverse to sort pushdown path only
zhuqi-lucas Apr 20, 2026
b2e93bc
perf: move stats init before PruningPredicate build + fix CastExpr un…
zhuqi-lucas Apr 21, 2026
45fb5d7
fix: null-aware filter + restrict stats init to sort pushdown path
zhuqi-lucas Apr 21, 2026
e490d8e
feat: enable stats init for ALL TopK queries + fix fuzz test tiebreaker
zhuqi-lucas Apr 21, 2026
5cc6a98
fix: restrict stats init to sort pushdown path to avoid over-pruning
zhuqi-lucas Apr 21, 2026
9bec94a
fix: stats init only safe for sorted (non-overlapping) RGs
zhuqi-lucas Apr 21, 2026
c0e6659
feat: enable stats init for pure TopK queries (no WHERE clause)
zhuqi-lucas Apr 21, 2026
0ca20d8
fix: stats init requires sort pushdown + no WHERE clause
zhuqi-lucas Apr 21, 2026
9ad381d
feat: TopK cumulative RG pruning after reorder (works with WHERE)
zhuqi-lucas Apr 21, 2026
40ec907
feat: enable RG reorder + cumulative prune for all TopK queries
zhuqi-lucas Apr 21, 2026
ef6123a
fix: only reverse/cumulate when reorder succeeds (prevents ClickBench…
zhuqi-lucas Apr 21, 2026
f76c5bc
fix: escape brackets in doc comment to fix rustdoc link error
zhuqi-lucas Apr 21, 2026
f91c5b9
chore: remove benchmark and listing_table_partitions changes from thi…
zhuqi-lucas Apr 21, 2026
743b846
refactor: remove stats init in favor of cumulative RG pruning + add S…
zhuqi-lucas Apr 21, 2026
f7c42d8
fix: cumulative prune only without WHERE to avoid under-returning rows
zhuqi-lucas Apr 21, 2026
3325610
feat: restore stats init with fixes (GtEq + df.fetch() + type cast)
zhuqi-lucas Apr 21, 2026
76f63aa
fix: SortExec.fetch was 0 when create_filter was called
zhuqi-lucas Apr 21, 2026
93a8668
perf: skip RG reorder when sort column not in file schema
zhuqi-lucas Apr 22, 2026
8ded1f4
fix: use slt:ignore for non-deterministic output_rows_skew metric
zhuqi-lucas Apr 22, 2026
138bd8e
fix: skip cumulative prune when row_selection exists + cleanup
zhuqi-lucas Apr 22, 2026
7acb19a
feat: support multi-key ORDER BY + truncate row_selection safety
zhuqi-lucas Apr 22, 2026
d725d84
test: add multi-key Inexact sort pushdown test (Test M)
zhuqi-lucas Apr 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 44 additions & 110 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@ clickbench_extended: ClickBench \"inspired\" queries against a single parquet
# Sort Pushdown Benchmarks
sort_pushdown: Sort pushdown baseline (no WITH ORDER) on TPC-H data (SF=1)
sort_pushdown_sorted: Sort pushdown with WITH ORDER — tests sort elimination on non-overlapping files
sort_pushdown_inexact: Sort pushdown Inexact path (--sorted DESC) — multi-file with scrambled RGs, tests reverse scan + RG reorder
sort_pushdown_inexact_unsorted: Sort pushdown Inexact path (no WITH ORDER) — same data, tests Unsupported path + RG reorder
sort_pushdown_inexact_overlap: Sort pushdown Inexact path — multi-file scrambled RGs (streaming data scenario)
sort_pushdown_inexact: Sort pushdown Inexact path (--sorted DESC) — tests reverse scan + RG reorder
sort_pushdown_inexact_unsorted: Sort pushdown Inexact path (no WITH ORDER) — tests Unsupported path + RG reorder
sort_pushdown_inexact_overlap: Sort pushdown Inexact path — partially overlapping RGs (streaming data scenario)

# Sorted Data Benchmarks (ORDER BY Optimization)
clickbench_sorted: ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization)
Expand Down Expand Up @@ -1154,150 +1154,84 @@ run_sort_pushdown_sorted() {

# Generates data for sort pushdown Inexact benchmark.
#
# Produces multiple parquet files where each file has MULTIPLE row groups
# with scrambled RG order. This tests both:
# - Row-group-level reorder within each file (reorder_by_statistics)
# - TopK threshold initialization from RG statistics
#
# Strategy:
# 1. Write a single sorted file with small (100K-row) RGs (~61 RGs total).
# 2. Use pyarrow to redistribute RGs into N_FILES files, scrambling the
# RG order within each file using a deterministic permutation.
# Each file gets ~61/N_FILES RGs with narrow, non-overlapping ranges
# but in scrambled order.
#
# Writing a single file with ORDER BY scramble does NOT work: the parquet
# writer merges rows from adjacent chunks at RG boundaries, widening
# ranges and defeating reorder_by_statistics.
#
# Requires pyarrow (pip install pyarrow).
# Produces a single large lineitem parquet file where row groups have
# NON-OVERLAPPING but OUT-OF-ORDER l_orderkey ranges (each RG internally
# sorted, RGs shuffled). This simulates append-heavy workloads where data
# is written in batches at different times.
data_sort_pushdown_inexact() {
INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact/lineitem"
if [ -d "${INEXACT_DIR}" ] && [ "$(ls -A ${INEXACT_DIR}/*.parquet 2>/dev/null)" ]; then
echo "Sort pushdown Inexact data already exists at ${INEXACT_DIR}"
return
fi

# Check pyarrow dependency (needed to split/scramble RGs)
if ! python3 -c "import pyarrow" 2>/dev/null; then
echo "Error: pyarrow is required for sort pushdown Inexact data generation."
echo "Install with: pip install pyarrow"
return 1
fi

echo "Generating sort pushdown Inexact benchmark data (multi-file, scrambled RGs)..."
echo "Generating sort pushdown Inexact benchmark data (single file, shuffled RGs)..."

# Re-use the sort_pushdown data as the source (generate if missing)
data_sort_pushdown

mkdir -p "${INEXACT_DIR}"
SRC_DIR="${DATA_DIR}/sort_pushdown/lineitem"

# Step 1: Write a single sorted file with small (100K-row) RGs
TMPFILE="${INEXACT_DIR}/_sorted_small_rgs.parquet"
# Use datafusion-cli to bucket rows into 64 groups by a deterministic
# scrambler, then sort within each bucket by orderkey. This produces
# ~64 RG-sized segments where each has a tight orderkey range but the
# segments appear in scrambled (non-sorted) order in the file.
(cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c "
CREATE EXTERNAL TABLE src
STORED AS PARQUET
LOCATION '${SRC_DIR}';

COPY (SELECT * FROM src ORDER BY l_orderkey)
TO '${TMPFILE}'
COPY (
SELECT * FROM src
ORDER BY
(l_orderkey * 1664525 + 1013904223) % 64,
l_orderkey
)
TO '${INEXACT_DIR}/shuffled.parquet'
STORED AS PARQUET
OPTIONS ('format.max_row_group_size' '100000');
")

# Step 2: Redistribute RGs into 3 files with scrambled RG order.
# Each file gets ~20 RGs. RG assignment: rg_idx % 3 determines file,
# permutation (rg_idx * 41 + 7) % n scrambles the order within file.
python3 -c "
import pyarrow.parquet as pq

pf = pq.ParquetFile('${TMPFILE}')
n = pf.metadata.num_row_groups
n_files = 3

# Assign each RG to a file, scramble order within each file
file_rgs = [[] for _ in range(n_files)]
for rg_idx in range(n):
slot = (rg_idx * 41 + 7) % n # scrambled index
file_id = slot % n_files
file_rgs[file_id].append(rg_idx)

# Write each file with its assigned RGs (in scrambled order)
for file_id in range(n_files):
rgs = file_rgs[file_id]
if not rgs:
continue
tables = [pf.read_row_group(rg) for rg in rgs]
writer = pq.ParquetWriter(
'${INEXACT_DIR}/part_%03d.parquet' % file_id,
pf.schema_arrow)
for t in tables:
writer.write_table(t)
writer.close()
print(f'File part_{file_id:03d}.parquet: {len(rgs)} RGs')
"

rm -f "${TMPFILE}"
echo "Sort pushdown Inexact data generated at ${INEXACT_DIR}"
echo "Sort pushdown Inexact shuffled data generated at ${INEXACT_DIR}"
ls -la "${INEXACT_DIR}"

# Also generate overlap data: same strategy but with different file count
# and permutation. Simulates streaming data with network delays where
# chunks arrive out of sequence.
#
# Requires pyarrow (pip install pyarrow).
# Also generate a file with partially overlapping row groups.
# Simulates streaming data with network delays: each chunk is mostly
# in order but has a small overlap with the next chunk (±5% of the
# chunk range). This is the pattern described by @adriangb — data
# arriving with timestamps that are generally increasing but with
# network-induced jitter causing small overlaps between row groups.
OVERLAP_DIR="${DATA_DIR}/sort_pushdown_inexact_overlap/lineitem"
if [ -d "${OVERLAP_DIR}" ] && [ "$(ls -A ${OVERLAP_DIR}/*.parquet 2>/dev/null)" ]; then
echo "Sort pushdown Inexact overlap data already exists at ${OVERLAP_DIR}"
return
fi

echo "Generating sort pushdown Inexact overlap data (multi-file, scrambled RGs)..."
echo "Generating sort pushdown Inexact overlap data (partially overlapping RGs)..."
mkdir -p "${OVERLAP_DIR}"

# Step 1: Write a single sorted file with small (100K-row) RGs
TMPFILE="${OVERLAP_DIR}/_sorted_small_rgs.parquet"
(cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c "
CREATE EXTERNAL TABLE src
STORED AS PARQUET
LOCATION '${SRC_DIR}';

COPY (SELECT * FROM src ORDER BY l_orderkey)
TO '${TMPFILE}'
-- Add jitter to l_orderkey: shift each row by a random-ish offset
-- proportional to its position. This creates overlap between adjacent
-- row groups while preserving the general ascending trend.
-- Formula: l_orderkey + (l_orderkey * 7 % 5000) - 2500
-- This adds ±2500 jitter, creating ~5K overlap between adjacent 100K-row RGs.
COPY (
SELECT * FROM src
ORDER BY l_orderkey + (l_orderkey * 7 % 5000) - 2500
)
TO '${OVERLAP_DIR}/overlapping.parquet'
STORED AS PARQUET
OPTIONS ('format.max_row_group_size' '100000');
")

# Step 2: Redistribute into 5 files with scrambled RG order.
python3 -c "
import pyarrow.parquet as pq

pf = pq.ParquetFile('${TMPFILE}')
n = pf.metadata.num_row_groups
n_files = 5

file_rgs = [[] for _ in range(n_files)]
for rg_idx in range(n):
slot = (rg_idx * 37 + 13) % n
file_id = slot % n_files
file_rgs[file_id].append(rg_idx)

for file_id in range(n_files):
rgs = file_rgs[file_id]
if not rgs:
continue
tables = [pf.read_row_group(rg) for rg in rgs]
writer = pq.ParquetWriter(
'${OVERLAP_DIR}/part_%03d.parquet' % file_id,
pf.schema_arrow)
for t in tables:
writer.write_table(t)
writer.close()
print(f'File part_{file_id:03d}.parquet: {len(rgs)} RGs')
"

rm -f "${TMPFILE}"
echo "Sort pushdown Inexact overlap data generated at ${OVERLAP_DIR}"
ls -la "${OVERLAP_DIR}"
}

# Runs the sort pushdown Inexact benchmark (tests RG reorder by statistics).
Expand All @@ -1306,7 +1240,7 @@ for file_id in range(n_files):
run_sort_pushdown_inexact() {
INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact.json"
echo "Running sort pushdown Inexact benchmark (multi-file scrambled RGs, --sorted DESC)..."
echo "Running sort pushdown Inexact benchmark (--sorted, DESC, reverse scan path)..."
DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${INEXACT_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
Expand All @@ -1322,13 +1256,13 @@ run_sort_pushdown_inexact_unsorted() {
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${INEXACT_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact_unsorted" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}

# Runs the sort pushdown benchmark with multi-file scrambled RG order.
# Simulates streaming data with network delays — multiple files, each with
# scrambled RGs. Tests both RG-level reorder and TopK stats initialization.
# Runs the sort pushdown benchmark with partially overlapping RGs.
# Simulates streaming data with network jitter — RGs are mostly in order
# but have small overlaps (±2500 orderkey jitter between adjacent RGs).
run_sort_pushdown_inexact_overlap() {
OVERLAP_DIR="${DATA_DIR}/sort_pushdown_inexact_overlap"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact_overlap.json"
echo "Running sort pushdown Inexact benchmark (multi-file scrambled RGs, streaming data pattern)..."
echo "Running sort pushdown Inexact benchmark (overlapping RGs, streaming data pattern)..."
DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${OVERLAP_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact_overlap" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
Expand Down
25 changes: 20 additions & 5 deletions datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -305,21 +305,36 @@ async fn test_fuzz_topk_filter_pushdown() {
}

let mut queries = vec![];
let all_columns = ["id", "name", "department"];

for limit in [1, 10] {
for num_order_by_columns in [1, 2, 3] {
for order_columns in ["id", "name", "department"]
.iter()
.combinations(num_order_by_columns)
{
for order_columns in all_columns.iter().combinations(num_order_by_columns) {
for orderings in order_columns
.iter()
.map(|col| orders.get(**col).unwrap())
.multi_cartesian_product()
{
// Add remaining columns as ASC tiebreakers to make
// the ordering fully deterministic. Without this,
// optimizations that change RG read order (e.g.
// statistics-based pruning) may produce different
// but equally valid tie-breaking results.
let used: Vec<&str> = order_columns.iter().map(|c| **c).collect();
let tiebreakers: Vec<String> = all_columns
.iter()
.filter(|c| !used.contains(*c))
.map(|c| format!("{c} ASC NULLS LAST"))
.collect();
let mut all_orderings: Vec<&str> =
orderings.iter().map(|s| s.as_str()).collect();
let tiebreaker_refs: Vec<&str> =
tiebreakers.iter().map(|s| s.as_str()).collect();
all_orderings.extend(tiebreaker_refs);

let query = format!(
"SELECT * FROM test_table ORDER BY {} LIMIT {}",
orderings.into_iter().join(", "),
all_orderings.join(", "),
limit
);
queries.push(query);
Expand Down
Loading
Loading