From 51b38bb4c386ae6056d0398492bbaf5ac077edd0 Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:58:20 -0500 Subject: [PATCH] Add docstring examples for Aggregate basic and bitwise/boolean functions Add example usage to docstrings for Aggregate basic and bitwise/boolean functions to improve documentation. Co-Authored-By: Claude Opus 4.6 --- python/datafusion/functions.py | 141 +++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index fd116254b..adbcf905f 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1894,6 +1894,15 @@ def approx_distinct( Args: expression: Values to check for distinct entries filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 1, 2, 3]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_distinct(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() >= 2 + True """ filter_raw = filter.expr if filter is not None else None @@ -1912,6 +1921,15 @@ def approx_median(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Values to find the median for filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_median(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.approx_median(expression.expr, filter=filter_raw)) @@ -1943,6 +1961,15 @@ def approx_percentile_cont( percentile: This must be between 0.0 and 1.0, inclusive num_centroids: Max bin size for the t-digest algorithm filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_percentile_cont(dfn.col("a"), 0.5).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3.0 """ sort_expr_raw = sort_or_default(sort_expression) filter_raw = filter.expr if filter is not None else None @@ -1975,6 +2002,15 @@ def approx_percentile_cont_with_weight( num_centroids: Max bin size for the t-digest algorithm filter: If provided, only compute against rows for which the filter is True + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_percentile_cont_with_weight(dfn.col("a"), + ... dfn.col("w"), 0.5).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ sort_expr_raw = sort_or_default(sort_expression) filter_raw = filter.expr if filter is not None else None @@ -2038,6 +2074,14 @@ def avg( Args: expression: Values to combine into an array filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.avg(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.avg(expression.expr, filter=filter_raw)) @@ -2076,6 +2120,14 @@ def count( expressions: Argument to perform bitwise calculation on distinct: If True, a single entry for each distinct value will be in the result filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.count(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None @@ -2140,6 +2192,14 @@ def max(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The value to find the maximum of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.max(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.max(expression.expr, filter=filter_raw)) @@ -2149,6 +2209,14 @@ def mean(expression: Expr, filter: Expr | None = None) -> Expr: """Returns the average (mean) value of the argument. This is an alias for :py:func:`avg`. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.mean(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ return avg(expression, filter) @@ -2168,6 +2236,14 @@ def median( expression: The value to compute the median of distinct: If True, a single entry for each distinct value will be in the result filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.median(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.median(expression.expr, distinct=distinct, filter=filter_raw)) @@ -2182,6 +2258,14 @@ def min(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The value to find the minimum of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.min(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1 """ filter_raw = filter.expr if filter is not None else None return Expr(f.min(expression.expr, filter=filter_raw)) @@ -2201,6 +2285,14 @@ def sum( Args: expression: Values to combine into an array filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.sum(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 6 """ filter_raw = filter.expr if filter is not None else None return Expr(f.sum(expression.expr, filter=filter_raw)) @@ -2618,6 +2710,14 @@ def bit_and(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Argument to perform bitwise calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [7, 3]}) + >>> result = df.aggregate([], [dfn.functions.bit_and(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_and(expression.expr, filter=filter_raw)) @@ -2634,6 +2734,14 @@ def bit_or(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Argument to perform bitwise calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2]}) + >>> result = df.aggregate([], [dfn.functions.bit_or(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_or(expression.expr, filter=filter_raw)) @@ -2653,6 +2761,14 @@ def bit_xor( expression: Argument to perform bitwise calculation on distinct: If True, evaluate each unique value of expression only once filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [5, 3]}) + >>> result = df.aggregate([], [dfn.functions.bit_xor(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 6 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw)) @@ -2670,6 +2786,14 @@ def bool_and(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Argument to perform calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [True, True, False]}) + >>> result = df.aggregate([], [dfn.functions.bool_and(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + False """ filter_raw = filter.expr if filter is not None else None return Expr(f.bool_and(expression.expr, filter=filter_raw)) @@ -2687,6 +2811,14 @@ def bool_or(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Argument to perform calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [False, False, True]}) + >>> result = df.aggregate([], [dfn.functions.bool_or(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + True """ filter_raw = filter.expr if filter is not None else None return Expr(f.bool_or(expression.expr, filter=filter_raw)) @@ -3077,6 +3209,15 @@ def string_agg( For example:: df.aggregate([], string_agg(col("a"), ",", order_by="b")) + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["x", "y", "z"]}) + >>> result = df.aggregate( + ... [], [dfn.functions.string_agg(dfn.col("a"), ",", order_by="a").alias("s")]) + >>> result.collect_column("s")[0].as_py() + 'x,y,z' """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None