diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 014f356cd64cd..2ada0411f4f8c 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -402,7 +402,7 @@ async fn test_fn_approx_median() -> Result<()> { +-----------------------+ | approx_median(test.b) | +-----------------------+ - | 10 | + | 10.0 | +-----------------------+ "); @@ -422,7 +422,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +---------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.5)) WITHIN GROUP [test.b ASC NULLS LAST] | +---------------------------------------------------------------------------+ - | 10 | + | 10.0 | +---------------------------------------------------------------------------+ "); @@ -437,7 +437,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +----------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.1)) WITHIN GROUP [test.b DESC NULLS LAST] | +----------------------------------------------------------------------------+ - | 100 | + | 100.0 | +----------------------------------------------------------------------------+ "); @@ -457,7 +457,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +--------------------------------------------------------------------+ | approx_percentile_cont(arg_2) WITHIN GROUP [test.b ASC NULLS LAST] | +--------------------------------------------------------------------+ - | 10 | + | 10.0 | +--------------------------------------------------------------------+ " ); @@ -477,7 +477,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +---------------------------------------------------------------------+ | approx_percentile_cont(arg_2) WITHIN GROUP [test.b DESC NULLS LAST] | +---------------------------------------------------------------------+ - | 100 | + | 100.0 | +---------------------------------------------------------------------+ " ); @@ -494,7 +494,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +------------------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.5),Int32(2)) WITHIN GROUP [test.b ASC NULLS LAST] | +------------------------------------------------------------------------------------+ - | 30 | + | 30.25 | +------------------------------------------------------------------------------------+ "); @@ -510,7 +510,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +-------------------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] | +-------------------------------------------------------------------------------------+ - | 69 | + | 69.85 | +-------------------------------------------------------------------------------------+ "); diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 26de1e23b38e1..e0830754399db 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -1204,26 +1204,26 @@ async fn window_using_aggregates() -> Result<()> { | first_value | last_val | approx_distinct | approx_median | median | max | min | c2 | c3 | +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ | | | | | | | | 1 | -85 | - | -85 | -101 | 14 | -12 | -12 | 83 | -101 | 4 | -54 | - | -85 | -101 | 17 | -25 | -25 | 83 | -101 | 5 | -31 | - | -85 | -12 | 10 | -32 | -34 | 83 | -85 | 3 | 13 | - | -85 | -25 | 3 | -56 | -56 | -25 | -85 | 1 | -5 | - | -85 | -31 | 18 | -29 | -28 | 83 | -101 | 5 | 36 | - | -85 | -38 | 16 | -25 | -25 | 83 | -101 | 4 | 65 | - | -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 | - | -85 | -48 | 6 | -35 | -36 | 83 | -85 | 2 | -43 | - | -85 | -5 | 4 | -37 | -40 | -5 | -85 | 1 | 83 | - | -85 | -54 | 15 | -17 | -18 | 83 | -101 | 4 | -38 | - | -85 | -56 | 2 | -70 | -70 | -56 | -85 | 1 | -25 | - | -85 | -72 | 9 | -43 | -43 | 83 | -85 | 3 | -12 | - | -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 | - | -85 | 13 | 11 | -17 | -18 | 83 | -85 | 3 | 14 | - | -85 | 13 | 11 | -25 | -25 | 83 | -85 | 3 | 13 | - | -85 | 14 | 12 | -12 | -12 | 83 | -85 | 3 | 17 | - | -85 | 17 | 13 | -11 | -8 | 83 | -85 | 4 | -101 | - | -85 | 45 | 8 | -34 | -34 | 83 | -85 | 3 | -72 | - | -85 | 65 | 17 | -17 | -18 | 83 | -101 | 5 | -101 | - | -85 | 83 | 5 | -25 | -25 | 83 | -85 | 2 | -48 | + | -85 | -101 | 14 | -12.0 | -12 | 83 | -101 | 4 | -54 | + | -85 | -101 | 17 | -25.0 | -25 | 83 | -101 | 5 | -31 | + | -85 | -12 | 10 | -32.75 | -34 | 83 | -85 | 3 | 13 | + | -85 | -25 | 3 | -56.0 | -56 | -25 | -85 | 1 | -5 | + | -85 | -31 | 18 | -29.75 | -28 | 83 | -101 | 5 | 36 | + | -85 | -38 | 16 | -25.0 | -25 | 83 | -101 | 4 | 65 | + | -85 | -43 | 7 | -43.0 | -43 | 83 | -85 | 2 | 45 | + | -85 | -48 | 6 | -35.75 | -36 | 83 | -85 | 2 | -43 | + | -85 | -5 | 4 | -37.75 | -40 | -5 | -85 | 1 | 83 | + | -85 | -54 | 15 | -17.0 | -18 | 83 | -101 | 4 | -38 | + | -85 | -56 | 2 | -70.5 | -70 | -56 | -85 | 1 | -25 | + | -85 | -72 | 9 | -43.0 | -43 | 83 | -85 | 3 | -12 | + | -85 | -85 | 1 | -85.0 | -85 | -85 | -85 | 1 | -56 | + | -85 | 13 | 11 | -17.0 | -18 | 83 | -85 | 3 | 14 | + | -85 | 13 | 11 | -25.0 | -25 | 83 | -85 | 3 | 13 | + | -85 | 14 | 12 | -12.0 | -12 | 83 | -85 | 3 | 17 | + | -85 | 17 | 13 | -11.25 | -8 | 83 | -85 | 4 | -101 | + | -85 | 45 | 8 | -34.5 | -34 | 83 | -85 | 3 | -72 | + | -85 | 65 | 17 | -17.0 | -18 | 83 | -101 | 5 | -101 | + | -85 | 83 | 5 | -25.0 | -25 | 83 | -85 | 2 | -48 | +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ " ); diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 42d4de939a8e8..3e941f00c2ee3 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -21,7 +21,6 @@ use std::fmt::Display; use std::hash::Hash; use std::sync::Arc; -use crate::type_coercion::aggregates::NUMERICS; use arrow::datatypes::{ DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION, DataType, Decimal128Type, DecimalType, Field, IntervalUnit, TimeUnit, @@ -596,6 +595,20 @@ impl Display for ArrayFunctionArgument { } } +static NUMERICS: &[DataType] = &[ + DataType::Int8, + DataType::Int16, + DataType::Int32, + DataType::Int64, + DataType::UInt8, + DataType::UInt16, + DataType::UInt32, + DataType::UInt64, + DataType::Float16, + DataType::Float32, + DataType::Float64, +]; + impl TypeSignature { pub fn to_string_repr(&self) -> Vec { match self { diff --git a/datafusion/expr-common/src/type_coercion/aggregates.rs b/datafusion/expr-common/src/type_coercion/aggregates.rs index df86ff582d658..ada0bd26b8d06 100644 --- a/datafusion/expr-common/src/type_coercion/aggregates.rs +++ b/datafusion/expr-common/src/type_coercion/aggregates.rs @@ -20,8 +20,7 @@ use arrow::datatypes::{DataType, FieldRef}; use datafusion_common::{Result, internal_err, plan_err}; -// TODO: remove usage of these (INTEGERS and NUMERICS) in favour of signatures -// see https://github.com/apache/datafusion/issues/18092 +#[deprecated(since = "54.0.0", note = "Use functions signatures")] pub static INTEGERS: &[DataType] = &[ DataType::Int8, DataType::Int16, @@ -33,6 +32,7 @@ pub static INTEGERS: &[DataType] = &[ DataType::UInt64, ]; +#[deprecated(since = "54.0.0", note = "Use functions signatures")] pub static NUMERICS: &[DataType] = &[ DataType::Int8, DataType::Int16, diff --git a/datafusion/expr/src/test/function_stub.rs b/datafusion/expr/src/test/function_stub.rs index 3be03260a9761..a1f29b649b2f8 100644 --- a/datafusion/expr/src/test/function_stub.rs +++ b/datafusion/expr/src/test/function_stub.rs @@ -29,13 +29,14 @@ use datafusion_common::plan_err; use datafusion_common::{Result, exec_err, not_impl_err, utils::take_function_args}; use crate::Volatility::Immutable; -use crate::type_coercion::aggregates::NUMERICS; use crate::{ - Accumulator, AggregateUDFImpl, Expr, GroupsAccumulator, ReversedUDAF, Signature, + Accumulator, AggregateUDFImpl, Coercion, Expr, GroupsAccumulator, ReversedUDAF, + Signature, TypeSignature, TypeSignatureClass, expr::AggregateFunction, function::{AccumulatorArgs, StateFieldsArgs}, utils::AggregateOrderSensitivity, }; +use datafusion_common::types::{NativeType, logical_float64}; macro_rules! create_func { ($UDAF:ty, $AGGREGATE_UDF_FN:ident) => { @@ -444,9 +445,22 @@ pub struct Avg { impl Avg { pub fn new() -> Self { + let signature = Signature::one_of( + vec![ + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Decimal, + )]), + TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Integer, TypeSignatureClass::Float], + NativeType::Float64, + )]), + ], + Immutable, + ); Self { aliases: vec![String::from("mean")], - signature: Signature::uniform(1, NUMERICS.to_vec(), Immutable), + signature, } } } diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs index c016ada695716..162dc224f2ccb 100644 --- a/datafusion/functions-aggregate/src/approx_median.rs +++ b/datafusion/functions-aggregate/src/approx_median.rs @@ -74,16 +74,11 @@ impl ApproxMedian { pub fn new() -> Self { Self { signature: Signature::one_of( - vec![ - TypeSignature::Coercible(vec![Coercion::new_exact( - TypeSignatureClass::Integer, - )]), - TypeSignature::Coercible(vec![Coercion::new_implicit( - TypeSignatureClass::Float, - vec![TypeSignatureClass::Decimal], - NativeType::Float64, - )]), - ], + vec![TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + )])], Volatility::Immutable, ), } diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 5e9e5beb0cb4b..3f1adcca12362 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -23,23 +23,20 @@ use arrow::array::{Array, Float16Array}; use arrow::compute::{filter, is_not_null}; use arrow::datatypes::FieldRef; use arrow::{ - array::{ - ArrayRef, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, - Int64Array, UInt8Array, UInt16Array, UInt32Array, UInt64Array, - }, + array::{ArrayRef, Float32Array, Float64Array}, datatypes::{DataType, Field}, }; +use datafusion_common::types::{NativeType, logical_float64}; use datafusion_common::{ DataFusionError, Result, ScalarValue, downcast_value, internal_err, not_impl_err, plan_err, }; use datafusion_expr::expr::{AggregateFunction, Sort}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; -use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature, - Volatility, + Accumulator, AggregateUDFImpl, Coercion, Documentation, Expr, Signature, + TypeSignature, TypeSignatureClass, Volatility, }; use datafusion_functions_aggregate_common::tdigest::{DEFAULT_MAX_SIZE, TDigest}; use datafusion_macros::user_doc; @@ -132,22 +129,44 @@ impl Default for ApproxPercentileCont { impl ApproxPercentileCont { /// Create a new [`ApproxPercentileCont`] aggregate function. pub fn new() -> Self { - let mut variants = Vec::with_capacity(NUMERICS.len() * (INTEGERS.len() + 1)); // Accept any numeric value paired with a float64 percentile - for num in NUMERICS { - variants.push(TypeSignature::Exact(vec![num.clone(), DataType::Float64])); - // Additionally accept an integer number of centroids for T-Digest - for int in INTEGERS { - variants.push(TypeSignature::Exact(vec![ - num.clone(), - DataType::Float64, - int.clone(), - ])) - } - } - Self { - signature: Signature::one_of(variants, Volatility::Immutable), - } + let signature = Signature::one_of( + vec![ + // 2 args - numeric, percentile (float) + TypeSignature::Coercible(vec![ + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + ]), + // 3 args - numeric, percentile (float), number of centroid for T-Digest (integer) + TypeSignature::Coercible(vec![ + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Integer, + vec![TypeSignatureClass::Numeric], + NativeType::Int64, + ), + ]), + ], + Volatility::Immutable, + ); + Self { signature } } pub(crate) fn create_accumulator( @@ -177,17 +196,7 @@ impl ApproxPercentileCont { let data_type = args.expr_fields[0].data_type(); let accumulator: ApproxPercentileAccumulator = match data_type { - DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 => { + DataType::Float16 | DataType::Float32 | DataType::Float64 => { if let Some(max_size) = tdigest_max_size { ApproxPercentileAccumulator::new_with_max_size( percentile, @@ -374,38 +383,6 @@ impl ApproxPercentileAccumulator { .map(|v| v.to_f64()) .collect::>()) } - DataType::Int64 => { - let array = downcast_value!(values, Int64Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::Int32 => { - let array = downcast_value!(values, Int32Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::Int16 => { - let array = downcast_value!(values, Int16Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::Int8 => { - let array = downcast_value!(values, Int8Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::UInt64 => { - let array = downcast_value!(values, UInt64Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::UInt32 => { - let array = downcast_value!(values, UInt32Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::UInt16 => { - let array = downcast_value!(values, UInt16Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::UInt8 => { - let array = downcast_value!(values, UInt8Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } e => internal_err!( "APPROX_PERCENTILE_CONT is not expected to receive the type {e:?}" ), @@ -439,14 +416,6 @@ impl Accumulator for ApproxPercentileAccumulator { // These acceptable return types MUST match the validation in // ApproxPercentile::create_accumulator. Ok(match &self.return_type { - DataType::Int8 => ScalarValue::Int8(Some(q as i8)), - DataType::Int16 => ScalarValue::Int16(Some(q as i16)), - DataType::Int32 => ScalarValue::Int32(Some(q as i32)), - DataType::Int64 => ScalarValue::Int64(Some(q as i64)), - DataType::UInt8 => ScalarValue::UInt8(Some(q as u8)), - DataType::UInt16 => ScalarValue::UInt16(Some(q as u16)), - DataType::UInt32 => ScalarValue::UInt32(Some(q as u32)), - DataType::UInt64 => ScalarValue::UInt64(Some(q as u64)), DataType::Float16 => ScalarValue::Float16(Some(half::f16::from_f64(q))), DataType::Float32 => ScalarValue::Float32(Some(q as f32)), DataType::Float64 => ScalarValue::Float64(Some(q)), diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs index 81a4787dd522f..6ada47fb38040 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs @@ -24,13 +24,13 @@ use arrow::compute::{and, filter, is_not_null}; use arrow::datatypes::FieldRef; use arrow::{array::ArrayRef, datatypes::DataType}; use datafusion_common::ScalarValue; +use datafusion_common::types::{NativeType, logical_float64}; use datafusion_common::{Result, not_impl_err, plan_err}; -use datafusion_expr::Volatility::Immutable; use datafusion_expr::expr::{AggregateFunction, Sort}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; -use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature, + Accumulator, AggregateUDFImpl, Coercion, Documentation, Expr, Signature, + TypeSignature, TypeSignatureClass, Volatility, }; use datafusion_functions_aggregate_common::tdigest::{Centroid, TDigest}; use datafusion_macros::user_doc; @@ -125,26 +125,54 @@ impl Default for ApproxPercentileContWithWeight { impl ApproxPercentileContWithWeight { /// Create a new [`ApproxPercentileContWithWeight`] aggregate function. pub fn new() -> Self { - let mut variants = Vec::with_capacity(NUMERICS.len() * (INTEGERS.len() + 1)); - // Accept any numeric value paired with weight and float64 percentile - for num in NUMERICS { - variants.push(TypeSignature::Exact(vec![ - num.clone(), - num.clone(), - DataType::Float64, - ])); - // Additionally accept an integer number of centroids for T-Digest - for int in INTEGERS { - variants.push(TypeSignature::Exact(vec![ - num.clone(), - num.clone(), - DataType::Float64, - int.clone(), - ])); - } - } + let signature = Signature::one_of( + vec![ + // 3 args - numeric, weight (float), percentile (float) + TypeSignature::Coercible(vec![ + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + ]), + // 4 args - numeric, weight (float), percentile (float), centroid (integer) + TypeSignature::Coercible(vec![ + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Integer, + vec![TypeSignatureClass::Numeric], + NativeType::Int64, + ), + ]), + ], + Volatility::Immutable, + ); Self { - signature: Signature::one_of(variants, Immutable), + signature, approx_percentile_cont: ApproxPercentileCont::new(), } } diff --git a/datafusion/functions-aggregate/src/utils.rs b/datafusion/functions-aggregate/src/utils.rs index 5e1925fcdbb5d..6d816e54bdaf2 100644 --- a/datafusion/functions-aggregate/src/utils.rs +++ b/datafusion/functions-aggregate/src/utils.rs @@ -54,6 +54,11 @@ pub(crate) fn validate_percentile_expr( let percentile = match scalar_value { ScalarValue::Float32(Some(value)) => value as f64, ScalarValue::Float64(Some(value)) => value, + ScalarValue::Float32(None) | ScalarValue::Float64(None) => { + return plan_err!( + "Percentile value for '{fn_name}' must be Float32 or Float64 literal (got null)" + ); + } sv => { return plan_err!( "Percentile value for '{fn_name}' must be Float32 or Float64 literal (got data type {})", diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 253428288ff49..bb17d13f5c769 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1912,7 +1912,10 @@ mod test { .err() .unwrap() .strip_backtrace(); - assert!(err.starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'avg' function: coercion from Utf8 to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64]) failed")); + assert!( + err.contains("Function 'avg' failed to match any signature"), + "Err: {err:?}" + ); Ok(()) } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 962d1d510395e..00ca0482a31e1 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -174,28 +174,22 @@ statement error DataFusion error: Schema error: Schema contains duplicate unqual SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100 # csv_query_approx_percentile_cont_with_weight -statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function +statement error Function 'approx_percentile_cont_with_weight' failed to match any signature SELECT approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c1) FROM aggregate_test_100 -statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function +statement error Function 'approx_percentile_cont_with_weight' failed to match any signature SELECT approx_percentile_cont_with_weight(c1, 0.95) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 -statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function +statement error Function 'approx_percentile_cont_with_weight' failed to match any signature SELECT approx_percentile_cont_with_weight(c2, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 # csv_query_approx_percentile_cont_with_histogram_bins statement error DataFusion error: Error during planning: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\. SELECT c1, approx_percentile_cont(0.95, -1000) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 -statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function +statement error Function 'approx_percentile_cont' failed to match any signature SELECT approx_percentile_cont(0.95, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 -statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Int16, Float64, Float64 to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 - -statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Float64, Float64, Float64 to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 - statement error DataFusion error: Error during planning: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal SELECT approx_percentile_cont(c12) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 @@ -948,16 +942,16 @@ SELECT c2, var_samp(CASE WHEN c12 > 0.90 THEN c12 ELSE null END) FROM aggregate_ # csv_query_approx_median_1 -query I +query R SELECT approx_median(c2) FROM aggregate_test_100 ---- 3 # csv_query_approx_median_2 -query I +query R SELECT approx_median(c6) FROM aggregate_test_100 ---- -1146409980542786560 +1146409980542786600 # csv_query_approx_median_3 query R @@ -1006,7 +1000,7 @@ SELECT median(col_i8), median(distinct col_i8) FROM median_table -14 100 # approx_distinct_median_i8 -query I +query R SELECT approx_median(distinct col_i8) FROM median_table ---- 100 @@ -2032,26 +2026,29 @@ SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c11) AS true # percentile_cont_with_nulls -query I +query R SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (NULL), (NULL), (NULL)) as t (v); ---- 2 # percentile_cont_with_nulls_only -query I +query R SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v); ---- NULL +query error DataFusion error: Error during planning: Percentile value for 'APPROX_PERCENTILE_CONT' must be Float32 or Float64 literal \(got null\) +SELECT APPROX_PERCENTILE_CONT(NULL) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v); + # percentile_cont_with_weight_with_nulls -query I +query R SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(w, 0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1, 1), (2, 1), (3, 1), (4, NULL), (NULL, 1), (NULL, NULL)) as t (v, w); ---- 2 # percentile_cont_with_weight_nulls_only -query I +query R SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(1, 0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v); ---- NULL @@ -2332,36 +2329,36 @@ b 5 NULL 20135.4 b NULL NULL 7732.315789473684 # csv_query_approx_percentile_cont_with_weight -query TI +query TR SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 # csv_query_approx_percentile_cont_with_weight (should be the same as above) -query TI +query TR SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 # using approx_percentile_cont on 2 columns with same signature -query TII +query TRR SELECT c1, approx_percentile_cont(c2, 0.95) AS c2, approx_percentile_cont(c3, 0.95) AS c3 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 5 73 +a 5 73.55 b 5 68 -c 5 122 -d 5 124 -e 5 115 +c 5 122.5 +d 5 124.2 +e 5 115.6 # error is unique to this UDAF query TRR @@ -2375,73 +2372,82 @@ e 3 40.333333333333 -query TI +query TR SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a -101 -b -114 -c -109 -d -98 -e -93 +b -114.3 +c -109.475 +d -98.6 +e -93.65 # csv_query_approx_percentile_cont_with_weight (2) -query TI +query TR SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 # csv_query_approx_percentile_cont_with_weight alternate syntax -query TI +query TR SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 - -query TI +query TR SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a -101 -b -114 -c -109 -d -98 -e -93 +b -114.3 +c -109.475 +d -98.6 +e -93.65 # csv_query_approx_percentile_cont_with_histogram_bins -query TI +query TR SELECT c1, approx_percentile_cont(0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 -query TI +query TR +SELECT c1, approx_percentile_cont(0.95, 200.1) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 73.55 +b 68 +c 122.5 +d 124.2 +e 115.6 + + +query TR SELECT c1, approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 65 b 68 c 122 -d 123 -e 110 +d 123.15 +e 110.266666666667 # approx_percentile_cont_with_weight with centroids -query TI +query TR SELECT c1, approx_percentile_cont_with_weight(c2, 0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 65 b 68 c 122 -d 123 -e 110 +d 123.15 +e 110.266666666667 # csv_query_sum_crossjoin query TTI @@ -3834,10 +3840,10 @@ SELECT COUNT(DISTINCT c1) FROM test # TODO: aggregate_with_alias # test_approx_percentile_cont_decimal_support -query TI +query TR SELECT c1, approx_percentile_cont(cast(0.85 as decimal(10,2))) WITHIN GROUP (ORDER BY c2) apc FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 4 +a 4.175 b 5 c 4 d 4 @@ -5978,7 +5984,7 @@ select median(a) from (select 1 as a where 1=0); ---- NULL -query I +query R select approx_median(a) from (select 1 as a where 1=0); ---- NULL diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index e12ac5782e3a4..cd95426a9b0bb 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -145,7 +145,7 @@ GROUP BY 1, 2 ORDER BY 1 LIMIT 5; -2117946883 d -2117946883 NULL NULL NULL -2098805236 c -2098805236 NULL NULL NULL -query ITIIII +query ITRRRR SELECT c5, c1, APPROX_MEDIAN(c5), APPROX_MEDIAN(CASE WHEN c1 = 'a' THEN c5 ELSE NULL END), @@ -318,14 +318,14 @@ SELECT c2, median(c5), median(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY 5 604973998 0.49842384 # Test approx_median for int / float -query IIR +query IRR SELECT c2, approx_median(c5), approx_median(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; ---- -1 191655437 0.59926736 +1 191655437.25 0.59926736 2 -587831330 0.43230486 3 240273900 0.40199697 4 762932956 0.48515016 -5 593204320 0.5156586 +5 593204320.5 0.5156586 # Test approx_distinct for varchar / int query III @@ -399,14 +399,14 @@ SELECT c2, median(c3), median(c11) FROM aggregate_test_100_null GROUP BY c2 ORDE 5 -35 0.5536642 # Test approx_median with nullable fields -query IIR +query IRR SELECT c2, approx_median(c3), approx_median(c11) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; ---- 1 12 0.6067944 2 1 0.46076488 3 14 0.40154034 -4 -7 0.48515016 -5 -39 0.5536642 +4 -7.75 0.48515016 +5 -39.75 0.5536642 # Test approx_distinct with nullable fields query II @@ -516,7 +516,7 @@ FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; 5 64 -59 # Test approx_median with filter -query III +query IRR SELECT c2, approx_median(c3) FILTER (WHERE c3 > 0), @@ -526,7 +526,7 @@ FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; 1 57 -56 2 52 -60 3 71 -76 -4 65 -64 +4 65 -64.75 5 64 -59 # Test count with nullable fields and filter @@ -662,20 +662,20 @@ FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; 5 -22 # Test approx_median with nullable fields and filter -query IIR +query IRR SELECT c2, approx_median(c3) FILTER (WHERE c5 > 0), approx_median(c11) FILTER (WHERE c5 < 0) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; ---- 1 -5 0.6623719 -2 12 0.52930677 +2 12.25 0.52930677 3 13 0.32792538 4 -38 0.49774808 -5 -21 0.47652745 +5 -21.75 0.47652745 # Test approx_median with nullable fields and nullable filter -query II +query IR SELECT c2, approx_median(c3) FILTER (WHERE c11 > 0.5) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; diff --git a/datafusion/sqllogictest/test_files/clickbench_extended.slt b/datafusion/sqllogictest/test_files/clickbench_extended.slt index ee3e33551ee3e..6b0d78cdba8f3 100644 --- a/datafusion/sqllogictest/test_files/clickbench_extended.slt +++ b/datafusion/sqllogictest/test_files/clickbench_extended.slt @@ -52,7 +52,7 @@ query IIIIII SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10; ---- -query IIIIII +query IIIIRI SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; ---- diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index 42589481f909e..f584e3c342271 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -83,7 +83,7 @@ select count(distinct name) from table_with_metadata; 2 # Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687 -query I +query R select approx_median(distinct id) from table_with_metadata; ---- 2 diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index d5fe197854c4d..c277f69d0bee2 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -321,3 +321,29 @@ clusters (e.g., ZWJ emoji sequences). For ASCII and most common Unicode text, behavior is unchanged. [#17861]: https://github.com/apache/datafusion/pull/17861 + +### `approx_percentile_cont`, `approx_percentile_cont_with_weight`, `approx_median` now coerce to floats + +The type signatures of `approx_percentile_cont`, `approx_percentile_cont_with_weight`, and +`approx_median` now coerce integer input values to `Float64` before computing the approximation. +As a result, these functions always return a float, even when the input column is an integer type. + +**Who is affected:** + +- Queries or downstream code that relied on `approx_percentile_cont` / `approx_percentile_cont_with_weight` / + `approx_median` returning an integer type when given an integer column. + +**Migration guide:** + +If downstream code checks or relies on the return type being an integer, add an explicit +`CAST` back to the desired integer type, or update the type assertion: + +```sql +-- Before (returned Int64): +SELECT approx_percentile_cont(quantity, 0.5) FROM orders; + +-- After (returns Float64); cast if an integer result is required: +SELECT CAST(approx_percentile_cont(quantity, 0.5) AS BIGINT) FROM orders; +``` + +[#21074]: https://github.com/apache/datafusion/pull/21074