Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
245a140
Add a default FileStatisticsCache implementation for the ListingTable
mkleen Jan 18, 2026
4fe76c6
fixup! Add a default FileStatisticsCache implementation for the Listi…
mkleen Jan 28, 2026
2eaad48
Adapt memory usage when removing entries
mkleen Feb 4, 2026
2e66963
Adapt heapsize for &str
mkleen Feb 4, 2026
7d98830
Fix formatting
mkleen Feb 4, 2026
898f09d
Adapt heapsize for &str and add another scalarvalue
mkleen Feb 4, 2026
ede6227
Add better error message
mkleen Feb 10, 2026
bec99fc
Add todo to add heapsize for ordering in CachedFileMetadata
mkleen Feb 10, 2026
c967e16
Fix comment/docs on DefaultFileStatisticsCache
mkleen Feb 10, 2026
414e691
Simplify test data generation
mkleen Feb 10, 2026
d343a72
Remove potential stale entry, if entry is too large
mkleen Feb 10, 2026
b5c692a
Fix typo in sql logic test comment
mkleen Feb 10, 2026
5bf43ca
Fix comment about default behaviour in cache manager
mkleen Feb 10, 2026
4c00374
Fix variable name in test
mkleen Feb 10, 2026
d33e5c2
Fix variable name in test
mkleen Feb 10, 2026
df94981
Disable cache for sql logic test
mkleen Feb 10, 2026
41fad06
Include key into memory estimation
mkleen Feb 11, 2026
6c4ad5e
Fix fmt
mkleen Feb 11, 2026
a146bb2
Fix clippy
mkleen Feb 11, 2026
e4d4950
minor
mkleen Feb 11, 2026
9721a10
Add more key memory accounting
mkleen Feb 12, 2026
3a872a6
Fix Formatting
mkleen Feb 12, 2026
c12a7d0
Account path as string and remove dependency to object_store
mkleen Feb 12, 2026
be3fd2f
Improve error handling
mkleen Feb 12, 2026
be84c5a
Fix fmt
mkleen Feb 12, 2026
2a960cf
Remove path.clone
mkleen Feb 12, 2026
0dbad65
Simplify accounting for statistics
mkleen Feb 12, 2026
bebbb02
Adapt offset buffer
mkleen Feb 12, 2026
1841b6d
Fix heap size for Arc
mkleen Feb 12, 2026
ed3c2d6
Adapt estimate in test
mkleen Feb 12, 2026
6e3e7ad
Fix sql logic test
mkleen Feb 12, 2026
6ee43ef
Register cache from cachemanager at listing table
mkleen Apr 8, 2026
48d3434
Revert slt
mkleen Apr 8, 2026
f9e0367
Add tablescoping for file stats cache
mkleen Feb 18, 2026
a31f621
Adapt slt
mkleen Apr 9, 2026
c990ad5
Fix linter
mkleen Apr 9, 2026
4755544
Remove unneeded clone
mkleen Apr 9, 2026
3ef2c6d
Rename cache_unit to file_statistics_cache
mkleen Apr 9, 2026
95203fb
Simplify heap size accounting
mkleen Apr 9, 2026
0d5f64a
Adapt comments in test
mkleen Apr 10, 2026
f85ff80
Separate drop table clean-ups
mkleen Apr 10, 2026
20c0862
fixup! Separate drop table clean-ups
mkleen Apr 10, 2026
0ba637b
Increase default limit to 10 mb
mkleen Apr 15, 2026
d624084
Increase default limit to 20 mb
mkleen Apr 15, 2026
302536d
Fix comment
mkleen Apr 15, 2026
dbb7f2f
Fix deregister logic
mkleen Apr 15, 2026
82c2388
Fix slt
mkleen Apr 15, 2026
e96736d
Add table reference to FileStatisticsCacheEntry
mkleen Apr 15, 2026
5ada4dc
fixup! Add table reference to FileStatisticsCacheEntry
mkleen Apr 15, 2026
3d7296a
Fix comment
mkleen Apr 15, 2026
9fd2d64
Fix runtime_env entry
mkleen Apr 19, 2026
c757ec6
Add cache for all benchmark runs
mkleen Apr 21, 2026
9f6b4d4
Add cache to listing table creation
mkleen Apr 21, 2026
6f3da34
fixup! Add cache to listing table creation
mkleen Apr 21, 2026
f90bad0
Adapt limit to 20M in configs.md
mkleen Apr 22, 2026
b534d0c
fixup! Adapt limit to 20M in configs.md
mkleen Apr 22, 2026
f901cf3
Fix linter
mkleen Apr 22, 2026
4141fe9
Add cache to listing table in _read_type()
mkleen Apr 22, 2026
83d595e
Add ListView and LargeListView to heapsize
mkleen Apr 22, 2026
d0db35b
fixup! Add ListView and LargeListView to heapsize
mkleen Apr 22, 2026
f762dc8
Remove array.slt
mkleen Apr 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion benchmarks/src/bin/external_aggr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,9 @@ impl ExternalAggrConfig {
let config = ListingTableConfig::new(table_path).with_listing_options(options);
let config = config.infer_schema(&state).await?;

Ok(Arc::new(ListingTable::try_new(config)?))
Ok(Arc::new(ListingTable::try_new(config)?.with_cache(
ctx.runtime_env().cache_manager.get_file_statistic_cache(),
)))
}

fn iterations(&self) -> usize {
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/src/imdb/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,9 @@ impl RunOpt {
_ => unreachable!(),
};

Ok(Arc::new(ListingTable::try_new(config)?))
Ok(Arc::new(ListingTable::try_new(config)?.with_cache(
ctx.runtime_env().cache_manager.get_file_statistic_cache(),
)))
}

fn iterations(&self) -> usize {
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/src/sort_pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,9 @@ impl RunOpt {
.with_listing_options(options)
.with_schema(schema);

Ok(Arc::new(ListingTable::try_new(config)?))
Ok(Arc::new(ListingTable::try_new(config)?.with_cache(
ctx.runtime_env().cache_manager.get_file_statistic_cache(),
)))
}

fn iterations(&self) -> usize {
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/src/sort_tpch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,9 @@ impl RunOpt {
.with_listing_options(options)
.with_schema(schema);

Ok(Arc::new(ListingTable::try_new(config)?))
Ok(Arc::new(ListingTable::try_new(config)?.with_cache(
ctx.runtime_env().cache_manager.get_file_statistic_cache(),
)))
}

fn iterations(&self) -> usize {
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/src/tpcds/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,9 @@ impl RunOpt {
.with_listing_options(options)
.with_schema(schema);

Ok(Arc::new(ListingTable::try_new(config)?))
Ok(Arc::new(ListingTable::try_new(config)?.with_cache(
ctx.runtime_env().cache_manager.get_file_statistic_cache(),
)))
}

fn iterations(&self) -> usize {
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/src/tpch/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,9 @@ impl RunOpt {
.with_listing_options(options)
.with_schema(schema);

Ok(Arc::new(ListingTable::try_new(config)?))
Ok(Arc::new(ListingTable::try_new(config)?.with_cache(
ctx.runtime_env().cache_manager.get_file_statistic_cache(),
)))
}

fn iterations(&self) -> usize {
Expand Down
58 changes: 1 addition & 57 deletions datafusion-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -443,10 +443,7 @@ mod tests {
use super::*;
use datafusion::{
common::test_util::batches_to_string,
execution::cache::{
DefaultListFilesCache, cache_manager::CacheManagerConfig,
cache_unit::DefaultFileStatisticsCache,
},
execution::cache::{DefaultListFilesCache, cache_manager::CacheManagerConfig},
prelude::{ParquetReadOptions, col, lit, split_part},
};
use insta::assert_snapshot;
Expand Down Expand Up @@ -656,8 +653,6 @@ mod tests {
Ok(())
}

/// Shows that the statistics cache is not enabled by default yet
/// See https://github.com/apache/datafusion/issues/19217
#[tokio::test]
async fn test_statistics_cache_default() -> Result<(), DataFusionError> {
let ctx = SessionContext::new();
Expand Down Expand Up @@ -687,57 +682,6 @@ mod tests {
.await?;
}

// When the cache manager creates a StatisticsCache by default,
// the contents will show up here
let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, num_rows, num_columns, table_size_bytes from statistics_cache() order by filename";
let df = ctx.sql(sql).await?;
let rbs = df.collect().await?;
assert_snapshot!(batches_to_string(&rbs),@r"
++
++
");

Ok(())
}

// Can be removed when https://github.com/apache/datafusion/issues/19217 is resolved
#[tokio::test]
async fn test_statistics_cache_override() -> Result<(), DataFusionError> {
// Install a specific StatisticsCache implementation
let file_statistics_cache = Arc::new(DefaultFileStatisticsCache::default());
let cache_config = CacheManagerConfig::default()
.with_files_statistics_cache(Some(file_statistics_cache.clone()));
let runtime = RuntimeEnvBuilder::new()
.with_cache_manager(cache_config)
.build()?;
let config = SessionConfig::new().with_collect_statistics(true);
let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime));

ctx.register_udtf(
"statistics_cache",
Arc::new(StatisticsCacheFunc::new(
ctx.task_ctx().runtime_env().cache_manager.clone(),
)),
);

for filename in [
"alltypes_plain",
"alltypes_tiny_pages",
"lz4_raw_compressed_larger",
] {
ctx.sql(
format!(
"create external table {filename}
stored as parquet
location '../parquet-testing/data/{filename}.parquet'",
)
.as_str(),
)
.await?
.collect()
.await?;
}

let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, num_rows, num_columns, table_size_bytes from statistics_cache() order by filename";
let df = ctx.sql(sql).await?;
let rbs = df.collect().await?;
Expand Down
1 change: 1 addition & 0 deletions datafusion/catalog-listing/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ fn try_into_partitioned_file(

let mut pf: PartitionedFile = object_meta.into();
pf.partition_values = partition_values;
pf.table_reference.clone_from(table_path.get_table_ref());

Ok(Some(pf))
}
Expand Down
35 changes: 19 additions & 16 deletions datafusion/catalog-listing/src/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ use datafusion_datasource::{
};
use datafusion_execution::cache::TableScopedPath;
use datafusion_execution::cache::cache_manager::FileStatisticsCache;
use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache;
use datafusion_expr::dml::InsertOp;
use datafusion_expr::execution_props::ExecutionProps;
use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType};
Expand Down Expand Up @@ -187,7 +186,7 @@ pub struct ListingTable {
/// The SQL definition for this table, if any
definition: Option<String>,
/// Cache for collected file statistics
collected_statistics: Arc<dyn FileStatisticsCache>,
collected_statistics: Option<Arc<dyn FileStatisticsCache>>,
/// Constraints applied to this table
constraints: Constraints,
/// Column default expressions for columns that are not physically present in the data files
Expand Down Expand Up @@ -231,7 +230,7 @@ impl ListingTable {
schema_source,
options,
definition: None,
collected_statistics: Arc::new(DefaultFileStatisticsCache::default()),
collected_statistics: None,
constraints: Constraints::default(),
column_defaults: HashMap::new(),
expr_adapter_factory: config.expr_adapter_factory,
Expand Down Expand Up @@ -260,10 +259,8 @@ impl ListingTable {
/// Setting a statistics cache on the `SessionContext` can avoid refetching statistics
/// multiple times in the same session.
///
/// If `None`, creates a new [`DefaultFileStatisticsCache`] scoped to this query.
pub fn with_cache(mut self, cache: Option<Arc<dyn FileStatisticsCache>>) -> Self {
self.collected_statistics =
cache.unwrap_or_else(|| Arc::new(DefaultFileStatisticsCache::default()));
self.collected_statistics = cache;
self
}

Expand Down Expand Up @@ -802,11 +799,15 @@ impl ListingTable {
) -> datafusion_common::Result<(Arc<Statistics>, Option<LexOrdering>)> {
use datafusion_execution::cache::cache_manager::CachedFileMetadata;

let path = &part_file.object_meta.location;
let path = TableScopedPath {
table: part_file.table_reference.clone(),
path: part_file.object_meta.location.clone(),
};
let meta = &part_file.object_meta;

// Check cache first - if we have valid cached statistics and ordering
if let Some(cached) = self.collected_statistics.get(path)
if let Some(cache) = &self.collected_statistics
&& let Some(cached) = cache.get(&path)
&& cached.is_valid_for(meta)
{
// Return cached statistics and ordering
Expand All @@ -823,14 +824,16 @@ impl ListingTable {
let statistics = Arc::new(file_meta.statistics);

// Store in cache
self.collected_statistics.put(
path,
CachedFileMetadata::new(
meta.clone(),
Arc::clone(&statistics),
file_meta.ordering.clone(),
),
);
if let Some(cache) = &self.collected_statistics {
cache.put(
&path,
CachedFileMetadata::new(
meta.clone(),
Arc::clone(&statistics),
file_meta.ordering.clone(),
),
);
}

Ok((statistics, file_meta.ordering))
}
Expand Down
Loading
Loading