Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 204 additions & 16 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,17 @@ def chr(arg: Expr) -> Expr:


def coalesce(*args: Expr) -> Expr:
    """Returns the value of the first expr in ``args`` which is not NULL.

    Args:
        *args: Expressions to check, in the order given.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]})
    >>> result = df.select(
    ...     dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c"))
    >>> result.collect_column("c")[0].as_py()
    2
    """
    # Forward the wrapped ``.expr`` of every argument to ``f.coalesce``; a
    # separate local keeps the ``*args`` parameter from being rebound.
    inner_exprs = [arg.expr for arg in args]
    return Expr(f.coalesce(*inner_exprs))

Expand Down Expand Up @@ -820,7 +830,16 @@ def ltrim(arg: Expr) -> Expr:


def md5(arg: Expr) -> Expr:
    """Computes an MD5 128-bit checksum for a string expression.

    Args:
        arg: String expression to hash.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": ["hello"]})
    >>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5"))
    >>> result.collect_column("md5")[0].as_py()
    '5d41402abc4b2a76b9719d911017c592'
    """
    return Expr(f.md5(arg.expr))


Expand All @@ -830,7 +849,18 @@ def nanvl(x: Expr, y: Expr) -> Expr:


def nvl(x: Expr, y: Expr) -> Expr:
    """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``.

    Args:
        x: Expression returned when it is not ``NULL``.
        y: Expression returned when ``x`` is ``NULL``.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]})
    >>> nvl_df = df.select(dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl"))
    >>> nvl_df.collect_column("nvl")[0].as_py()
    0
    >>> nvl_df.collect_column("nvl")[1].as_py()
    1
    """
    return Expr(f.nvl(x.expr, y.expr))


Expand Down Expand Up @@ -899,21 +929,45 @@ def radians(arg: Expr) -> Expr:


def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
    r"""Find if any regular expression (regex) matches exist.

    Tests a string using a regular expression returning true if at least one match,
    false otherwise.

    Args:
        string: Expression to test.
        regex: Regular expression to match against ``string``.
        flags: Optional regex flags expression. ``None`` is forwarded as ``None``.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": ["hello123"]})
    >>> result = df.select(
    ...     dfn.functions.regexp_like(
    ...         dfn.col("a"), dfn.lit("\\d+")
    ...     ).alias("m")
    ... )
    >>> result.collect_column("m")[0].as_py()
    True
    """
    # Unwrap the optional flags without rebinding the parameter to a new type.
    flags_expr = flags.expr if flags is not None else None
    return Expr(f.regexp_like(string.expr, regex.expr, flags_expr))


def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
"""Perform regular expression (regex) matching.
r"""Perform regular expression (regex) matching.

Returns an array with each element containing the leftmost-first match of the
corresponding index in ``regex`` to string in ``string``.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": ["hello 42 world"]})
>>> result = df.select(
... dfn.functions.regexp_match(
... dfn.col("a"), dfn.lit("(\\d+)")
... ).alias("m")
... )
>>> result.collect_column("m")[0].as_py()
['42']
"""
if flags is not None:
flags = flags.expr
Expand All @@ -923,13 +977,26 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
def regexp_replace(
string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None
) -> Expr:
"""Replaces substring(s) matching a PCRE-like regular expression.
r"""Replaces substring(s) matching a PCRE-like regular expression.

The full list of supported features and syntax can be found at
<https://docs.rs/regex/latest/regex/#syntax>

Supported flags with the addition of 'g' can be found at
<https://docs.rs/regex/latest/regex/#grouping-and-flags>

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": ["hello 42"]})
>>> result = df.select(
... dfn.functions.regexp_replace(
... dfn.col("a"), dfn.lit("\\d+"),
... dfn.lit("XX")
... ).alias("r")
... )
>>> result.collect_column("r")[0].as_py()
'hello XX'
"""
if flags is not None:
flags = flags.expr
Expand All @@ -943,6 +1010,15 @@ def regexp_count(

Optional start position (the first position is 1) to search for the regular
expression.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": ["abcabc"]})
>>> result = df.select(
... dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c"))
>>> result.collect_column("c")[0].as_py()
2
"""
if flags is not None:
flags = flags.expr
Expand All @@ -958,12 +1034,24 @@ def regexp_instr(
flags: Expr | None = None,
sub_expr: Expr | None = None,
) -> Expr:
"""Returns the position of a regular expression match in a string.
r"""Returns the position of a regular expression match in a string.

Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position
``start`` (the first position is 1). Returns the starting or ending position based
on ``end_position``. Use ``flags`` to control regex behavior and ``sub_expr`` to
return the position of a specific capture group instead of the entire match.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": ["hello 42 world"]})
>>> result = df.select(
... dfn.functions.regexp_instr(
... dfn.col("a"), dfn.lit("\\d+")
... ).alias("pos")
... )
>>> result.collect_column("pos")[0].as_py()
7
"""
start = start.expr if start is not None else None
n = n.expr if n is not None else None
Expand Down Expand Up @@ -1030,22 +1118,66 @@ def rtrim(arg: Expr) -> Expr:


def sha224(arg: Expr) -> Expr:
    """Computes the SHA-224 hash of a binary string.

    Args:
        arg: Expression to hash.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": ["hello"]})
    >>> result = df.select(
    ...     dfn.functions.sha224(dfn.col("a")).alias("h")
    ... )
    >>> len(result.collect_column("h")[0].as_py()) > 0
    True
    """
    return Expr(f.sha224(arg.expr))


def sha256(arg: Expr) -> Expr:
    """Computes the SHA-256 hash of a binary string.

    Args:
        arg: Expression to hash.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": ["hello"]})
    >>> result = df.select(
    ...     dfn.functions.sha256(dfn.col("a")).alias("h")
    ... )
    >>> len(result.collect_column("h")[0].as_py()) > 0
    True
    """
    return Expr(f.sha256(arg.expr))


def sha384(arg: Expr) -> Expr:
    """Computes the SHA-384 hash of a binary string.

    Args:
        arg: Expression to hash.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": ["hello"]})
    >>> result = df.select(
    ...     dfn.functions.sha384(dfn.col("a")).alias("h")
    ... )
    >>> len(result.collect_column("h")[0].as_py()) > 0
    True
    """
    return Expr(f.sha384(arg.expr))


def sha512(arg: Expr) -> Expr:
    """Computes the SHA-512 hash of a binary string.

    Args:
        arg: Expression to hash.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": ["hello"]})
    >>> result = df.select(
    ...     dfn.functions.sha512(dfn.col("a")).alias("h")
    ... )
    >>> len(result.collect_column("h")[0].as_py()) > 0
    True
    """
    return Expr(f.sha512(arg.expr))


Expand Down Expand Up @@ -1370,18 +1502,55 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr:


def uuid() -> Expr:
    """Returns uuid v4 as a string value.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": [1]})
    >>> result = df.select(
    ...     dfn.functions.uuid().alias("u")
    ... )
    >>> len(result.collect_column("u")[0].as_py()) == 36
    True
    """
    return Expr(f.uuid())


def struct(*args: Expr) -> Expr:
    """Returns a struct with the given arguments.

    Args:
        *args: Expressions that become the struct's fields, in the order given.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": [1], "b": [2]})
    >>> result = df.select(
    ...     dfn.functions.struct(
    ...         dfn.col("a"), dfn.col("b")
    ...     ).alias("s")
    ... )
    >>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2}
    True
    """
    # Forward the wrapped ``.expr`` of every argument to ``f.struct``; a
    # separate local keeps the ``*args`` parameter from being rebound.
    inner_exprs = [arg.expr for arg in args]
    return Expr(f.struct(*inner_exprs))


def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr:
"""Returns a struct with the given names and arguments pairs."""
"""Returns a struct with the given names and arguments pairs.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1]})
>>> result = df.select(
... dfn.functions.named_struct(
... [("x", dfn.lit(10)), ("y", dfn.lit(20))]
... ).alias("s")
... )
>>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20}
True
"""
name_pair_exprs = [
[Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]]
for pair in name_pairs
Expand All @@ -1398,12 +1567,31 @@ def from_unixtime(arg: Expr) -> Expr:


def arrow_typeof(arg: Expr) -> Expr:
    """Returns the Arrow type of the expression.

    Args:
        arg: Expression whose Arrow type is reported.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": [1]})
    >>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t"))
    >>> result.collect_column("t")[0].as_py()
    'Int64'
    """
    return Expr(f.arrow_typeof(arg.expr))


def arrow_cast(expr: Expr, data_type: Expr) -> Expr:
    """Casts an expression to a specified data type.

    Args:
        expr: Expression to cast.
        data_type: Expression naming the target type; the example below passes
            a string literal such as ``"Float64"``.

    Examples:
    ---------
    >>> ctx = dfn.SessionContext()
    >>> df = ctx.from_pydict({"a": [1]})
    >>> data_type = dfn.string_literal("Float64")
    >>> result = df.select(dfn.functions.arrow_cast(dfn.col("a"), data_type).alias("c"))
    >>> result.collect_column("c")[0].as_py()
    1.0
    """
    return Expr(f.arrow_cast(expr.expr, data_type.expr))


Expand Down
Loading